diff --git a/.buildkite/performance-benchmarks/README.md b/.buildkite/performance-benchmarks/README.md index 015f48c2520d6..64a262c6cb401 100644 --- a/.buildkite/performance-benchmarks/README.md +++ b/.buildkite/performance-benchmarks/README.md @@ -7,7 +7,7 @@ vLLM also maintains a continuous performance benchmark under [perf.vllm.ai](http ## Performance benchmark quick overview -**Benchmarking Coverage**: latency, throughput and fix-qps serving on B200, A100, H100, Intel® Xeon® Processors and Intel® Gaudi® 3 Accelerators with different models. +**Benchmarking Coverage**: latency, throughput and fix-qps serving on B200, A100, H100, Intel® Xeon® Processors, Intel® Gaudi® 3 Accelerators and Arm® Neoverse™ with different models. **Benchmarking Duration**: about 1hr. @@ -23,7 +23,7 @@ bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh Runtime environment variables: -- `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0. +- `ON_CPU`: set the value to '1' on Intel® Xeon® and Arm® Neoverse™ Processors. Default value is 0. - `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file). - `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file). - `THROUGHPUT_JSON`: JSON file to use for the throughout tests. Default value is empty string (use default file). @@ -34,8 +34,9 @@ Runtime environment variables: See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases. > NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead. -For Intel® Gaudi® 3 Accelerators, use `tests/latency-tests-hpu.json`, `tests/throughput-tests-hpu.json`, `tests/serving-tests-hpu.json` instead. 
-> +> For Intel® Gaudi® 3 Accelerators, use `tests/latency-tests-hpu.json`, `tests/throughput-tests-hpu.json`, `tests/serving-tests-hpu.json` instead. +> For Arm® Neoverse™, use `tests/latency-tests-arm64-cpu.json`, `tests/throughput-tests-arm64-cpu.json`, `tests/serving-tests-arm64-cpu.json` instead. + ### Latency test Here is an example of one test inside `latency-tests.json`: diff --git a/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh old mode 100644 new mode 100755 index 34ceefe0996f2..6b6a7e472b9c8 --- a/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh +++ b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh @@ -49,7 +49,11 @@ check_cpus() { echo "Need at least 1 NUMA to run benchmarking." exit 1 fi - declare -g gpu_type="cpu" + if [[ "$(uname -m)" == "aarch64" ]] || [[ "$(uname -m)" == "arm64" ]]; then + declare -g gpu_type="arm64-cpu" + else + declare -g gpu_type="cpu" + fi echo "GPU type is $gpu_type" } @@ -207,8 +211,8 @@ run_latency_tests() { # check if there is enough GPU to run the test tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size') - if [ "$ON_CPU" == "1" ]; then - pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size') + if [[ "$ON_CPU" == "1" ]]; then + pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size // 1') world_size=$(($tp*$pp)) if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name." 
@@ -276,8 +280,8 @@ run_throughput_tests() { # check if there is enough GPU to run the test tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size') - if [ "$ON_CPU" == "1" ]; then - pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size') + if [[ "$ON_CPU" == "1" ]]; then + pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size // 1') world_size=$(($tp*$pp)) if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name." @@ -393,8 +397,8 @@ run_serving_tests() { # check if there is enough resources to run the test tp=$(echo "$server_params" | jq -r '.tensor_parallel_size') - if [ "$ON_CPU" == "1" ]; then - pp=$(echo "$server_params" | jq -r '.pipeline_parallel_size') + if [[ "$ON_CPU" == "1" ]]; then + pp=$(echo "$server_params" | jq -r '.pipeline_parallel_size // 1') world_size=$(($tp*$pp)) if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name." 
@@ -496,9 +500,9 @@ run_serving_tests() { main() { local ARCH ARCH='' - if [ "$ON_CPU" == "1" ];then - check_cpus - ARCH='-cpu' + if [[ "$ON_CPU" == "1" ]]; then + check_cpus + ARCH="-$gpu_type" else check_gpus ARCH="$arch_suffix" diff --git a/.buildkite/performance-benchmarks/tests/latency-tests-arm64-cpu.json b/.buildkite/performance-benchmarks/tests/latency-tests-arm64-cpu.json new file mode 100644 index 0000000000000..fba695041e3ee --- /dev/null +++ b/.buildkite/performance-benchmarks/tests/latency-tests-arm64-cpu.json @@ -0,0 +1,26 @@ +[ + { + "test_name": "latency_llama8B_tp1", + "environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "load_format": "dummy", + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "num_iters_warmup": 5, + "num_iters": 15 + } + } +] diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-arm64-cpu.json b/.buildkite/performance-benchmarks/tests/serving-tests-arm64-cpu.json new file mode 100644 index 0000000000000..63f1f8ab887b3 --- /dev/null +++ b/.buildkite/performance-benchmarks/tests/serving-tests-arm64-cpu.json @@ -0,0 +1,130 @@ +{ + "defaults": { + "qps_list": [ + "inf" + ], + "max_concurrency_list": [ + 12, + 16, + 24, + 32, + 64, + 128, + 200 + ], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + 
"trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "backend": "vllm", + "ignore-eos": "", + "num_prompts": 200 + } + }, + "tests": [ + { + "test_name": "serving_llama8B_tp1_sharegpt", + "server_parameters": { + "tensor_parallel_size": 1 + }, + "client_parameters": { + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json" + } + }, + { + "test_name": "serving_llama8B_tp2_sharegpt", + "server_parameters": { + "tensor_parallel_size": 2 + }, + "client_parameters": { + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json" + } + }, + { + "test_name": "serving_llama8B_tp1_random_128_128", + "server_parameters": { + "tensor_parallel_size": 1 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_llama8B_tp2_random_128_128", + "server_parameters": { + "tensor_parallel_size": 2 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_llama8B_tp1_random_128_2048", + "server_parameters": { + "tensor_parallel_size": 1 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 2048 + } + }, + { + "test_name": "serving_llama8B_tp2_random_128_2048", + "server_parameters": { + "tensor_parallel_size": 2 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 2048 + } + }, + { + "test_name": "serving_llama8B_tp1_random_2048_128", + "server_parameters": { + "tensor_parallel_size": 1 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 2048, + "random-output-len": 128 + } + }, + { + "test_name": 
"serving_llama8B_tp2_random_2048_128", + "server_parameters": { + "tensor_parallel_size": 2 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 2048, + "random-output-len": 128 + } + } + ] +} \ No newline at end of file diff --git a/.buildkite/performance-benchmarks/tests/throughput-tests-arm64-cpu.json b/.buildkite/performance-benchmarks/tests/throughput-tests-arm64-cpu.json new file mode 100644 index 0000000000000..da84dd4d0c67a --- /dev/null +++ b/.buildkite/performance-benchmarks/tests/throughput-tests-arm64-cpu.json @@ -0,0 +1,27 @@ +[ + { + "test_name": "throughput_llama8B_tp1", + "environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "load_format": "dummy", + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm" + } + } +] diff --git a/.buildkite/scripts/generate-nightly-index.py b/.buildkite/scripts/generate-nightly-index.py index d0965fbd56405..1794df9479e55 100644 --- a/.buildkite/scripts/generate-nightly-index.py +++ b/.buildkite/scripts/generate-nightly-index.py @@ -291,6 +291,7 @@ if __name__ == "__main__": """ Arguments: --version : version string for the current build (e.g., commit hash) + --wheel-dir : directory containing wheel files (default to be same as `version`) --current-objects : path to JSON file containing current S3 objects listing in this version directory --output-dir : directory to store generated index files --alias-to-default : (optional) alias variant name for the default variant @@ -318,6 +319,12 @@ if __name__ == "__main__": 
required=True, help="Directory to store generated index files", ) + parser.add_argument( + "--wheel-dir", + type=str, + default=None, + help="Directory containing wheel files (default to be same as `version`)", + ) parser.add_argument( "--alias-to-default", type=str, @@ -372,7 +379,7 @@ if __name__ == "__main__": print(f"Found {len(wheel_files)} wheel files for version {version}: {wheel_files}") - # keep only "official" files for a non-nightly version (specifed by cli args) + # keep only "official" files for a non-nightly version (specified by cli args) PY_VERSION_RE = re.compile(r"^\d+\.\d+\.\d+([a-zA-Z0-9.+-]*)?$") if PY_VERSION_RE.match(version): # upload-wheels.sh ensures no "dev" is in args.version @@ -384,9 +391,10 @@ if __name__ == "__main__": print("Nightly version detected, keeping all wheel files.") # Generate index and metadata, assuming wheels and indices are stored as: - # s3://vllm-wheels/{version}/ + # s3://vllm-wheels/{wheel_dir}/ # s3://vllm-wheels// - wheel_base_dir = Path(output_dir).parent / version + wheel_dir = args.wheel_dir or version + wheel_base_dir = Path(output_dir).parent / wheel_dir.strip().rstrip("/") index_base_dir = Path(output_dir) generate_index_and_metadata( diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh index 438fe522c8702..471c8616df85c 100644 --- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh @@ -50,6 +50,7 @@ function cpu_tests() { docker exec cpu-test-"$NUMA_NODE" bash -c " set -e pytest -x -v -s tests/kernels/attention/test_cpu_attn.py + pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py pytest -x -v -s tests/kernels/test_onednn.py" # Run basic model test diff --git a/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh b/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh index 6a1bef275d047..d0921c5699d5d 100644 --- 
a/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh +++ b/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh @@ -44,10 +44,10 @@ trap cleanup EXIT for BACK in "${BACKENDS[@]}"; do VLLM_DEEP_GEMM_WARMUP=skip \ - VLLM_ALL2ALL_BACKEND=$BACK \ vllm serve "$MODEL" \ --enforce-eager \ --enable-eplb \ + --all2all-backend $BACK \ --eplb-config '{"window_size":10, "step_interval":100, "num_redundant_experts":0, "log_balancedness":true}' \ --tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \ --data-parallel-size ${DATA_PARALLEL_SIZE} \ diff --git a/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh b/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh index 937a43d1a3221..b3b65128e6062 100644 --- a/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh +++ b/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh @@ -43,12 +43,12 @@ trap cleanup EXIT for BACK in "${BACKENDS[@]}"; do VLLM_DEEP_GEMM_WARMUP=skip \ - VLLM_ALL2ALL_BACKEND=$BACK \ vllm serve "$MODEL" \ --enforce-eager \ --tensor-parallel-size 4 \ --enable-expert-parallel \ --enable-eplb \ + --all2all-backend $BACK \ --eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \ --speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \ --trust-remote-code \ diff --git a/.buildkite/scripts/upload-wheels.sh b/.buildkite/scripts/upload-wheels.sh index 3a218a4bb2e6d..1af7f476ae74b 100644 --- a/.buildkite/scripts/upload-wheels.sh +++ b/.buildkite/scripts/upload-wheels.sh @@ -102,6 +102,7 @@ if [[ "$version" != *"dev"* ]]; then echo "Re-generating indices for /$pure_version/" rm -rf "$INDICES_OUTPUT_DIR/*" mkdir -p "$INDICES_OUTPUT_DIR" - $PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" $alias_arg + # wheel-dir is overridden 
to be the commit directory, so that the indices point to the correct wheel path + $PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --wheel-dir "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" $alias_arg aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/" fi diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 6df373632d730..a4d89a46b01ac 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -349,7 +349,9 @@ steps: - label: V1 Test e2e + engine # 65min timeout_in_minutes: 90 mirror_hardwares: [amdexperimental] - agent_pool: mi325_4 + # The test uses 4 GPUs, but we schedule it on 8-GPU machines for stability. + # See discussion here: https://github.com/vllm-project/vllm/pull/31040 + agent_pool: mi325_8 # grade: Blocking source_file_dependencies: - vllm/ @@ -754,7 +756,7 @@ steps: - vllm/model_executor/layers/quantization autorun_on_main: true commands: - - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1 + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt - label: OpenAI API correctness # 10min timeout_in_minutes: 15 @@ -964,7 +966,7 @@ steps: - pytest -v -s models/multimodal/processing - label: Multi-Modal Models Test (Standard) # 60min - timeout_in_minutes: 80 + timeout_in_minutes: 100 mirror_hardwares: [amdexperimental] agent_pool: mi325_1 # grade: Blocking @@ -973,13 +975,15 @@ steps: - vllm/ - tests/models/multimodal commands: + - export MIOPEN_DEBUG_CONV_DIRECT=0 + - export MIOPEN_DEBUG_CONV_GEMM=0 - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pip freeze | grep -E 'torch' - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing - cd .. 
&& VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work -- label: Multi-Modal Accuracy Eval (Small Models) # 150min - 180min - timeout_in_minutes: 180 +- label: Multi-Modal Accuracy Eval (Small Models) # 5min + timeout_in_minutes: 10 mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 # grade: Blocking @@ -989,7 +993,9 @@ steps: - vllm/inputs/ - vllm/v1/core/ commands: - - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1 + - export MIOPEN_DEBUG_CONV_DIRECT=0 + - export MIOPEN_DEBUG_CONV_GEMM=0 + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt - label: Multi-Modal Models Test (Extended) 1 # 60min timeout_in_minutes: 120 @@ -1001,10 +1007,13 @@ steps: - vllm/ - tests/models/multimodal commands: + - export MIOPEN_DEBUG_CONV_DIRECT=0 + - export MIOPEN_DEBUG_CONV_GEMM=0 - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing -- label: Multi-Modal Models Test (Extended) 2 +- label: Multi-Modal Models Test (Extended) 2 #60min + timeout_in_minutes: 120 mirror_hardwares: [amdexperimental] agent_pool: mi325_1 # grade: Blocking @@ -1013,6 +1022,8 @@ steps: - vllm/ - tests/models/multimodal commands: + - export MIOPEN_DEBUG_CONV_DIRECT=0 + - export MIOPEN_DEBUG_CONV_GEMM=0 - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model' @@ -1026,6 +1037,8 @@ steps: - vllm/ - tests/models/multimodal commands: + - export MIOPEN_DEBUG_CONV_DIRECT=0 + - export MIOPEN_DEBUG_CONV_GEMM=0 - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not 
core_model' @@ -1203,7 +1216,7 @@ steps: - csrc/ - vllm/model_executor/layers/quantization commands: - - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1 + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt ##### 1 GPU test ##### ##### multi gpus test ##### @@ -1243,13 +1256,13 @@ steps: - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up) - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' - - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code + - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up) - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' - - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 
--enforce-eager --trust-remote-code + - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code - label: Distributed Tests (2 GPUs) # 68min timeout_in_minutes: 90 @@ -1497,7 +1510,7 @@ steps: - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'" - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py - - HIP_VISIBLE_DEVICES=0,1 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 + - HIP_VISIBLE_DEVICES=0,1 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput - pytest -v -s tests/v1/distributed/test_dbo.py ##### B200 test ##### @@ -1521,7 +1534,7 @@ steps: - csrc/ - vllm/model_executor/layers/quantization commands: - - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1 + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt - label: LM Eval Large Models (4 Card) mirror_hardwares: [amdexperimental, amdproduction] diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 8e3bcfe4a36bc..7b664c4fa15fe 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -319,7 +319,10 @@ steps: # TODO: accuracy does not match, whether setting # VLLM_USE_FLASHINFER_SAMPLER or not on H100. - pytest -v -s v1/e2e - - pytest -v -s v1/engine + # Run this test standalone for now; + # need to untangle use (implicit) use of spawn/fork across the tests. 
+ - pytest -v -s v1/engine/test_preprocess_error_handling.py + - pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py - label: V1 Test entrypoints # 35min timeout_in_minutes: 50 @@ -1106,13 +1109,13 @@ steps: - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up) - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' - - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code + - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up) - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' - - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code + - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 
--dp-master-port=12345 --enforce-eager --trust-remote-code - label: Distributed Tests (2 GPUs) # 68min timeout_in_minutes: 90 @@ -1331,7 +1334,7 @@ steps: - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'" - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py - - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 + - CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput - pytest -v -s tests/v1/distributed/test_dbo.py ##### B200 test ##### @@ -1356,6 +1359,7 @@ steps: - vllm/ - .buildkite/scripts/run-prime-rl-test.sh commands: + - nvidia-smi - bash .buildkite/scripts/run-prime-rl-test.sh - label: DeepSeek V2-Lite Accuracy diff --git a/.buildkite/test_areas/distributed.yaml b/.buildkite/test_areas/distributed.yaml index 2cc90698d916a..65a981a9d6d00 100644 --- a/.buildkite/test_areas/distributed.yaml +++ b/.buildkite/test_areas/distributed.yaml @@ -145,7 +145,7 @@ steps: - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4' - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py - - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 + - CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 
examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput - pytest -v -s tests/v1/distributed/test_dbo.py - label: Distributed Tests (2 GPUs)(B200) @@ -171,7 +171,7 @@ steps: - tests/distributed/ - tests/examples/offline_inference/data_parallel.py commands: - - ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code" + - ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d 
--rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code" - label: Distributed NixlConnector PD accuracy (4 GPUs) timeout_in_minutes: 30 diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index d6447649cd89a..4d7a366f05e37 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -15,6 +15,7 @@ /vllm/lora @jeejeelee /vllm/reasoning @aarnphm @chaunceyjiang /vllm/entrypoints @aarnphm @chaunceyjiang +/vllm/tool_parsers @aarnphm @chaunceyjiang /vllm/compilation @zou3519 @youkaichao @ProExpertProg /vllm/distributed/kv_transfer @NickLucche @ApostaC CMakeLists.txt @tlrmchlsmth @LucasWilkinson diff --git a/.github/mergify.yml b/.github/mergify.yml index 3e4e21efe39df..61a03135be395 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -349,6 +349,18 @@ pull_request_rules: add: - tool-calling +- name: auto-rebase if approved, ready, and 40 commits behind main + conditions: + - base = main + - label=ready 
+ - "#approved-reviews-by >= 1" + - "#commits-behind >= 40" + - -closed + - -draft + - -conflict + actions: + rebase: {} + - name: ping author on conflicts and add 'needs-rebase' label conditions: - label != stale diff --git a/CMakeLists.txt b/CMakeLists.txt index a14496e035d9a..c46fb18d7bfef 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -799,24 +799,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") else() cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}") endif() - if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS) - set(SRCS "csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu") - set_gencode_flags_for_srcs( - SRCS "${SRCS}" - CUDA_ARCHS "${SCALED_MM_ARCHS}") - list(APPEND VLLM_EXT_SRC "${SRCS}") - list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM100=1") - message(STATUS "Building blockwise_scaled_group_mm_sm100 for archs: ${SCALED_MM_ARCHS}") - else() - if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS) - message(STATUS "Not building blockwise_scaled_group_mm_sm100 kernels as CUDA Compiler version is " - "not >= 12.8, we recommend upgrading to CUDA 12.8 or later " - "if you intend on running FP8 quantized MoE models on Blackwell.") - else() - message(STATUS "Not building blockwise_scaled_group_mm_sm100 as no compatible archs found " - "in CUDA target architectures") - endif() - endif() # # Machete kernels diff --git a/benchmarks/benchmark_batch_invariance.py b/benchmarks/benchmark_batch_invariance.py index b5c16c42de467..7473a41e51406 100755 --- a/benchmarks/benchmark_batch_invariance.py +++ b/benchmarks/benchmark_batch_invariance.py @@ -104,7 +104,6 @@ def run_benchmark_with_batch_invariant( random.seed(seed) # Set environment variables - os.environ["VLLM_ATTENTION_BACKEND"] = backend if batch_invariant: os.environ["VLLM_BATCH_INVARIANT"] = "1" else: @@ -140,6 +139,7 @@ def run_benchmark_with_batch_invariant( max_model_len=max_model_len, 
dtype="bfloat16", tensor_parallel_size=tp_size, + attention_config={"backend": backend}, enable_prefix_caching=False, ) init_time = time.perf_counter() - start_init diff --git a/benchmarks/kernels/bench_nvfp4_quant.py b/benchmarks/kernels/bench_nvfp4_quant.py new file mode 100644 index 0000000000000..7517376535925 --- /dev/null +++ b/benchmarks/kernels/bench_nvfp4_quant.py @@ -0,0 +1,177 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import argparse +import copy +import itertools + +import torch +from weight_shapes import WEIGHT_SHAPES + +from vllm import _custom_ops as ops +from vllm.platforms import current_platform +from vllm.scalar_type import scalar_types +from vllm.triton_utils import triton +from vllm.utils.flashinfer import flashinfer_fp4_quantize + +if not current_platform.has_device_capability(100): + raise RuntimeError("NVFP4 requires compute capability of 10.0 (Blackwell)") + +FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max() +FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max + +PROVIDER_CFGS = { + "vllm": dict(backend="vllm", enabled=True), + "flashinfer": dict(backend="flashinfer", enabled=True), +} + +_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]] + + +def compute_global_scale(tensor: torch.Tensor) -> torch.Tensor: + """Compute global scale for FP4 quantization.""" + amax = torch.abs(tensor).max().to(torch.float32) + return FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / amax + + +@triton.testing.perf_report( + triton.testing.Benchmark( + x_names=["batch_size"], + x_vals=[1, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096], + x_log=False, + line_arg="provider", + line_vals=_enabled, + line_names=_enabled, + ylabel="us (lower is better)", + plot_name="NVFP4 Input Quantization Latency (us)", + args={}, + ) +) +def benchmark(batch_size, provider, N, K): + M = batch_size + device = "cuda" + dtype = torch.bfloat16 + + # Create input tensor + a = torch.randn((M, K), device=device, 
dtype=dtype) + + # Compute global scale for activation + a_global_scale = compute_global_scale(a) + + quantiles = [0.5, 0.2, 0.8] + + cfg = PROVIDER_CFGS[provider] + + if cfg["backend"] == "vllm": + # vLLM's FP4 quantization + ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( + lambda: ops.scaled_fp4_quant(a, a_global_scale), + quantiles=quantiles, + ) + elif cfg["backend"] == "flashinfer": + # FlashInfer's FP4 quantization + # Use is_sf_swizzled_layout=True to match vLLM's output format + ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( + lambda: flashinfer_fp4_quantize( + a, a_global_scale, is_sf_swizzled_layout=True + ), + quantiles=quantiles, + ) + + # Convert ms to us for better readability at small batch sizes + to_us = lambda t_ms: t_ms * 1000 + return to_us(ms), to_us(max_ms), to_us(min_ms) + + +def prepare_shapes(args): + out = [] + for model, tp_size in itertools.product(args.models, args.tp_sizes): + for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]): + KN[tp_dim] //= tp_size + KN.append(model) + out.append(KN) + return out + + +def _test_accuracy_once(M: int, K: int, dtype: torch.dtype, device: str): + """Test accuracy between vLLM and FlashInfer FP4 quantization.""" + # Create input tensor + a = torch.randn((M, K), device=device, dtype=dtype) + + # Compute global scale + a_global_scale = compute_global_scale(a) + + # vLLM quantization + vllm_fp4, vllm_scale = ops.scaled_fp4_quant(a, a_global_scale) + + # FlashInfer quantization (with swizzled layout to match vLLM's output) + flashinfer_fp4, flashinfer_scale = flashinfer_fp4_quantize( + a, a_global_scale, is_sf_swizzled_layout=True + ) + flashinfer_scale = flashinfer_scale.view(torch.float8_e4m3fn) + + # Compare outputs + torch.testing.assert_close( + vllm_fp4, + flashinfer_fp4, + ) + print(f"M={M}, K={K}, dtype={dtype}: PASSED") + + +def test_accuracy(): + """Run accuracy tests across various shapes.""" + print("\n" + "=" * 60) + print("Running accuracy tests: vLLM vs FlashInfer") + 
print("=" * 60) + + device = "cuda" + dtype = torch.bfloat16 + + # Test various batch sizes and hidden dimensions + Ms = [1, 1024] + Ks = [4096] + + for M in Ms: + for K in Ks: + _test_accuracy_once(M, K, dtype, device) + + print("\nAll accuracy tests passed!") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Benchmark NVFP4 quantization: vLLM vs FlashInfer" + ) + parser.add_argument( + "--models", + nargs="+", + type=str, + default=["meta-llama/Llama-3.1-8B-Instruct"], + choices=list(WEIGHT_SHAPES.keys()), + ) + parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1]) + parser.add_argument( + "--save-path", + type=str, + default=None, + help="Path to save benchmark results", + ) + parser.add_argument( + "--accuracy", + action="store_true", + help="Run accuracy tests", + ) + args = parser.parse_args() + + if args.accuracy: + test_accuracy() + + for K, N, model in prepare_shapes(args): + print(f"\n{model}, N={N} K={K}") + benchmark.run( + print_data=True, + save_path=args.save_path, + N=N, + K=K, + ) + + print("\nBenchmark finished!") diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index 85b286f8d8d0a..0af87fd7f0b53 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -330,7 +330,7 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON PUBLIC ${oneDNN_BINARY_DIR}/include PRIVATE ${oneDNN_SOURCE_DIR}/src ) - target_link_libraries(dnnl_ext dnnl) + target_link_libraries(dnnl_ext dnnl torch) target_compile_options(dnnl_ext PRIVATE ${CXX_COMPILE_FLAGS} -fPIC) list(APPEND LIBS dnnl_ext) set(USE_ONEDNN ON) @@ -358,13 +358,13 @@ set(VLLM_EXT_SRC "csrc/cpu/pos_encoding.cpp" "csrc/moe/dynamic_4bit_int_moe_cpu.cpp" "csrc/cpu/cpu_attn.cpp" - "csrc/cpu/scratchpad_manager.cpp" "csrc/cpu/torch_bindings.cpp") if (AVX512_FOUND AND NOT AVX512_DISABLED) set(VLLM_EXT_SRC "csrc/cpu/shm.cpp" "csrc/cpu/cpu_wna16.cpp" + "csrc/cpu/cpu_fused_moe.cpp" ${VLLM_EXT_SRC}) if 
(ENABLE_AVX512BF16 AND ENABLE_AVX512VNNI) set(VLLM_EXT_SRC diff --git a/csrc/cpu/cpu_attn_macros.h b/csrc/cpu/cpu_arch_macros.h similarity index 97% rename from csrc/cpu/cpu_attn_macros.h rename to csrc/cpu/cpu_arch_macros.h index 35716a0790ab3..c73b62ecdec90 100644 --- a/csrc/cpu/cpu_attn_macros.h +++ b/csrc/cpu/cpu_arch_macros.h @@ -1,5 +1,5 @@ -#ifndef CPU_ATTN_MACROS_H -#define CPU_ATTN_MACROS_H +#ifndef CPU_ARCH_MACROS_H +#define CPU_ARCH_MACROS_H // x86_64 #ifdef __x86_64__ @@ -26,7 +26,7 @@ _mm512_castsi512_ps(_mm512_set1_epi32(0x42b17218)); \ const __m512i vec_127 = _mm512_set1_epi32(0x0000007f); \ const int n_mantissa_bits = 23; \ - auto fast_exp = [&](vec_op::FP32Vec16& vec) __attribute__(( \ + auto fast_exp = [&](const vec_op::FP32Vec16& vec) __attribute__(( \ always_inline)) { \ __m512 values = vec.reg; \ auto less_ln_flt_min_mask = \ @@ -98,7 +98,7 @@ poly = vbslq_f32(hi_mask, inf, poly); \ return vbslq_f32(lo_mask, zero, poly); \ }; \ - auto fast_exp = [&](vec_op::FP32Vec16& vec) \ + auto fast_exp = [&](const vec_op::FP32Vec16& vec) \ __attribute__((always_inline)) { \ float32x4x4_t result; \ result.val[0] = neon_expf(vec.reg.val[0]); \ @@ -110,4 +110,4 @@ #endif // __aarch64__ -#endif \ No newline at end of file +#endif diff --git a/csrc/cpu/cpu_attn_impl.hpp b/csrc/cpu/cpu_attn_impl.hpp index e3e077b845f4f..08d208e05a62c 100644 --- a/csrc/cpu/cpu_attn_impl.hpp +++ b/csrc/cpu/cpu_attn_impl.hpp @@ -8,10 +8,8 @@ #include #endif -#include "cpu_types.hpp" -#include "scratchpad_manager.h" -#include "cpu_attn_macros.h" -#include "utils.hpp" +#include "cpu/cpu_arch_macros.h" +#include "cpu/utils.hpp" namespace cpu_attention { enum class ISA { AMX, VEC, VEC16, NEON }; @@ -378,12 +376,13 @@ class AttentionScheduler { static constexpr int32_t MaxQTileIterNum = 128; - AttentionScheduler() : available_cache_size_(get_available_l2_size()) {} + AttentionScheduler() + : available_cache_size_(cpu_utils::get_available_l2_size()) {} torch::Tensor schedule(const 
ScheduleInput& input) const { const bool casual = input.casual; const int32_t thread_num = omp_get_max_threads(); - const int64_t cache_size = get_available_l2_size(); + const int64_t cache_size = cpu_utils::get_available_l2_size(); const int32_t max_num_q_per_iter = input.max_num_q_per_iter; const int32_t kv_len_alignment = input.kv_block_alignment; int32_t q_head_per_kv = input.num_heads_q / input.num_heads_kv; @@ -659,7 +658,7 @@ class AttentionScheduler { metadata_ptr->thread_num + metadata_ptr->reduction_scratchpad_size_per_kv_head * (use_gqa ? input.num_heads_kv : input.num_heads_q); - DNNLScratchPadManager::get_dnnl_scratchpad_manager()->realloc( + cpu_utils::ScratchPadManager::get_scratchpad_manager()->realloc( scratchpad_size); // metadata_ptr->print(); @@ -667,7 +666,7 @@ class AttentionScheduler { // test out of boundary access // { // float* cache_ptr = - // DNNLScratchPadManager::get_dnnl_scratchpad_manager()->get_data(); + // cpu_utils::ScratchPadManager::getl_scratchpad_manager()->get_data(); // for (int64_t i = 0; i < scratchpad_size / sizeof(float); ++i) { // cache_ptr[i] = std::numeric_limits::quiet_NaN(); // } @@ -749,27 +748,6 @@ class AttentionScheduler { return std::max(rounded_tile_size, round_size); } - static int64_t get_available_l2_size() { - static int64_t size = []() { -#if defined(__APPLE__) - // macOS doesn't have _SC_LEVEL2_CACHE_SIZE. Use sysctlbyname. 
- int64_t l2_cache_size = 0; - size_t len = sizeof(l2_cache_size); - if (sysctlbyname("hw.l2cachesize", &l2_cache_size, &len, NULL, 0) == 0 && - l2_cache_size > 0) { - return l2_cache_size >> 1; // use 50% of L2 cache - } - // Fallback if sysctlbyname fails - return 128LL * 1024 >> 1; // use 50% of 128KB -#else - long l2_cache_size = sysconf(_SC_LEVEL2_CACHE_SIZE); - TORCH_CHECK_NE(l2_cache_size, -1); - return l2_cache_size >> 1; // use 50% of L2 cache -#endif - }(); - return size; - } - private: int64_t available_cache_size_; }; @@ -1402,7 +1380,7 @@ class AttentionMainLoop { // init buffers void* scratchpad_ptr = - DNNLScratchPadManager::get_dnnl_scratchpad_manager() + cpu_utils::ScratchPadManager::get_scratchpad_manager() ->get_data(); AttentionScratchPad buffer_manager(thread_id, metadata, scratchpad_ptr); @@ -1422,8 +1400,7 @@ class AttentionMainLoop { } } - const int64_t available_cache_size = - AttentionScheduler::get_available_l2_size(); + const int64_t available_cache_size = cpu_utils::get_available_l2_size(); const int32_t default_tile_size = AttentionScheduler::calcu_default_tile_size( available_cache_size, head_dim, sizeof(kv_cache_t), diff --git a/csrc/cpu/cpu_fused_moe.cpp b/csrc/cpu/cpu_fused_moe.cpp new file mode 100644 index 0000000000000..090e2d4cd4b56 --- /dev/null +++ b/csrc/cpu/cpu_fused_moe.cpp @@ -0,0 +1,727 @@ +#include "cpu/cpu_types.hpp" +#include "cpu/utils.hpp" +#include "cpu/micro_gemm/cpu_micro_gemm_vec.hpp" +#include "cpu/cpu_arch_macros.h" + +#ifdef CPU_CAPABILITY_AMXBF16 + #include "cpu/micro_gemm/cpu_micro_gemm_amx.hpp" + #define AMX_DISPATCH(...) \ + case cpu_utils::ISA::AMX: { \ + using gemm_t = cpu_micro_gemm::MicroGemm; \ + return __VA_ARGS__(); \ + } +#else + #define AMX_DISPATCH(...) case cpu_utils::ISA::AMX: +#endif + +#define CPU_ISA_DISPATCH_IMPL(ISA_TYPE, ...) 
\ + [&] { \ + switch (ISA_TYPE) { \ + AMX_DISPATCH(__VA_ARGS__) \ + case cpu_utils::ISA::VEC: { \ + using gemm_t = \ + cpu_micro_gemm::MicroGemm; \ + return __VA_ARGS__(); \ + } \ + default: { \ + TORCH_CHECK(false, "Invalid CPU ISA type."); \ + } \ + } \ + }() + +namespace { +enum class FusedMOEAct { SiluAndMul, SwigluOAIAndMul }; + +FusedMOEAct get_act_type(const std::string& act) { + if (act == "silu") { + return FusedMOEAct::SiluAndMul; + } else if (act == "swigluoai") { + return FusedMOEAct::SwigluOAIAndMul; + } else { + TORCH_CHECK(false, "Invalid act type: " + act); + } +} + +template +void swigluoai_and_mul(float* __restrict__ input, scalar_t* __restrict__ output, + const int32_t m_size, const int32_t n_size, + const int32_t input_stride, + const int32_t output_stride) { + using scalar_vec_t = typename cpu_utils::VecTypeTrait::vec_t; + // For GPT-OSS interleaved gate-up weights + alignas(64) static int32_t index[16] = {0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30}; + vec_op::INT32Vec16 index_vec(index); + vec_op::FP32Vec16 gate_up_max_vec(7.0); + vec_op::FP32Vec16 up_min_vec(-7.0); + vec_op::FP32Vec16 alpha_vec(1.702); + vec_op::FP32Vec16 one_vec(1.0); + + DEFINE_FAST_EXP + + for (int32_t m = 0; m < m_size; ++m) { + for (int32_t n = 0; n < n_size; n += 32) { + vec_op::FP32Vec16 gate_vec(input + n, index_vec); + vec_op::FP32Vec16 up_vec(input + n + 1, index_vec); + gate_vec = gate_vec.min(gate_up_max_vec); + up_vec = up_vec.clamp(up_min_vec, gate_up_max_vec); + auto sigmoid_vec = one_vec / (one_vec + fast_exp(-gate_vec * alpha_vec)); + auto glu = gate_vec * sigmoid_vec; + auto gated_output_fp32 = (one_vec + up_vec) * glu; + scalar_vec_t gated_output = scalar_vec_t(gated_output_fp32); + gated_output.save(output + n / 2); + } + input += input_stride; + output += output_stride; + } +} + +template +void silu_and_mul(float* __restrict__ input, scalar_t* __restrict__ output, + const int32_t m_size, const int32_t n_size, + const int32_t input_stride, 
const int32_t output_stride) { + using scalar_vec_t = typename cpu_utils::VecTypeTrait::vec_t; + const int32_t dim = n_size / 2; + float* __restrict__ gate = input; + float* __restrict__ up = input + dim; + vec_op::FP32Vec16 one_vec(1.0); + + DEFINE_FAST_EXP + + for (int32_t m = 0; m < m_size; ++m) { + for (int32_t n = 0; n < dim; n += 16) { + vec_op::FP32Vec16 gate_vec(gate + n); + vec_op::FP32Vec16 up_vec(up + n); + auto sigmoid_vec = one_vec / (one_vec + fast_exp(-gate_vec)); + auto silu = gate_vec * sigmoid_vec; + auto gated_output_fp32 = up_vec * silu; + scalar_vec_t gated_output = scalar_vec_t(gated_output_fp32); + gated_output.save(output + n); + } + gate += input_stride; + up += input_stride; + output += output_stride; + } +} + +template +FORCE_INLINE void apply_gated_act(const FusedMOEAct act, + float* __restrict__ input, + scalar_t* __restrict__ output, + const int32_t m, const int32_t n, + const int32_t input_stride, + const int32_t output_stride) { + switch (act) { + case FusedMOEAct::SwigluOAIAndMul: + swigluoai_and_mul(input, output, m, n, input_stride, output_stride); + return; + case FusedMOEAct::SiluAndMul: + silu_and_mul(input, output, m, n, input_stride, output_stride); + return; + default: + TORCH_CHECK(false, "Unsupported act type."); + } +} + +template +void prepack_moe_weight_impl(scalar_t* __restrict__ weight_ptr, + scalar_t* __restrict__ packed_weight_ptr, + const int32_t expert_num, + const int32_t output_size, + const int32_t input_size, + const int64_t expert_stride) { +#pragma omp parallel for + for (int32_t e_idx = 0; e_idx < expert_num; ++e_idx) { + gemm_t::pack_weight(weight_ptr + expert_stride * e_idx, + packed_weight_ptr + expert_stride * e_idx, output_size, + input_size); + } +} + +template +void fused_moe_impl(scalar_t* __restrict__ output, scalar_t* __restrict__ input, + w_t* __restrict__ w13, w_t* __restrict__ w2, + w_t* __restrict__ w13_bias, w_t* __restrict__ w2_bias, + float* __restrict__ topk_weights, + int32_t* 
__restrict__ topk_id, FusedMOEAct act_type, + const int32_t token_num, const int32_t expert_num, + const int32_t topk_num, const int32_t input_size_13, + const int32_t output_size_13, const int32_t input_size_2, + const int32_t output_size_2) { + using scalar_vec_t = typename cpu_utils::VecTypeTrait::vec_t; + constexpr int32_t gemm_n_tile_size = gemm_t::NSize; + constexpr int32_t gemm_m_tile_size = gemm_t::MaxMSize; + constexpr int32_t min_w13_n_tile_size = 2 * gemm_n_tile_size; + static_assert(gemm_n_tile_size % 16 == 0); + + TORCH_CHECK_EQ(output_size_13 % min_w13_n_tile_size, 0); + TORCH_CHECK_EQ(output_size_2 % gemm_n_tile_size, 0); + TORCH_CHECK_EQ(output_size_13 / 2, input_size_2); + + const int32_t thread_num = omp_get_max_threads(); + + const int32_t w13_input_buffer_size = cpu_utils::round_up<64>( + gemm_m_tile_size * input_size_13 * sizeof(scalar_t)); + + const int32_t w13_n_tile_size = [&]() { + const int64_t cache_size = cpu_utils::get_available_l2_size(); + // input buffer + output buffer + weight + const int32_t n_size_cache_limit = + (cache_size - w13_input_buffer_size) / + (gemm_m_tile_size * sizeof(float) + input_size_13 * sizeof(scalar_t)); + const int32_t n_size_thread_limit = + output_size_13 / std::max(1, thread_num / topk_num); + const int32_t n_size = cpu_utils::round_down( + std::min(n_size_cache_limit, n_size_thread_limit)); + return std::max(n_size, min_w13_n_tile_size); + }(); + + const int32_t w2_input_tile_size = cpu_utils::round_up<64>( + gemm_m_tile_size * input_size_2 * sizeof(scalar_t)); + + const int32_t w2_n_tile_size = [&]() { + const int64_t cache_size = cpu_utils::get_available_l2_size(); + // input tile + weight + const int32_t n_size_cache_limit = + (cache_size - w2_input_tile_size) / (input_size_2 * sizeof(scalar_t)); + const int32_t n_size_thread_limit = + output_size_2 / std::max(1, thread_num / topk_num); + const int32_t n_size = cpu_utils::round_down( + std::min(n_size_cache_limit, n_size_thread_limit)); + return 
std::max(n_size, gemm_n_tile_size); + }(); + + // allocate buffers + int32_t common_buffer_offset = 0; + int32_t w13_thread_buffer_offset = 0; + int32_t ws_thread_buffer_offset = 0; + + // common buffers + const int32_t token_num_per_group_buffer_size = + cpu_utils::round_up<64>(expert_num * sizeof(int32_t)); + const int32_t token_num_per_group_buffer_offset = common_buffer_offset; + common_buffer_offset += token_num_per_group_buffer_size; + + const int32_t cu_token_num_per_group_buffer_size = + cpu_utils::round_up<64>((expert_num + 1) * sizeof(int32_t)); + const int32_t cu_token_num_per_group_buffer_offset = common_buffer_offset; + common_buffer_offset += cu_token_num_per_group_buffer_size; + + const int32_t expand_token_id_buffer_size = + cpu_utils::round_up<64>(token_num * topk_num * sizeof(int32_t)); + const int32_t expand_token_id_buffer_offset = common_buffer_offset; + common_buffer_offset += expand_token_id_buffer_size; + + const int32_t expand_token_id_index_buffer_size = + cpu_utils::round_up<64>(token_num * topk_num * sizeof(int32_t)); + const int32_t expand_token_id_index_buffer_offset = common_buffer_offset; + common_buffer_offset += expand_token_id_index_buffer_size; + + const int32_t w13_gemm_output_buffer_size = cpu_utils::round_up<64>( + token_num * topk_num * (output_size_13 / 2) * sizeof(scalar_t)); + const int32_t w13_gemm_output_buffer_offset = common_buffer_offset; + common_buffer_offset += w13_gemm_output_buffer_size; + + const int32_t w2_gemm_output_buffer_size = cpu_utils::round_up<64>( + token_num * topk_num * output_size_2 * sizeof(float)); + const int32_t w2_gemm_output_buffer_offset = common_buffer_offset; + common_buffer_offset += w2_gemm_output_buffer_size; + + // w13 GEMM thread buffers + const int32_t w13_input_buffer_offset = w13_thread_buffer_offset; + w13_thread_buffer_offset += w13_input_buffer_size; + + const int32_t w13_output_buffer_size = cpu_utils::round_up<64>( + gemm_m_tile_size * w13_n_tile_size * sizeof(float)); + const 
int32_t w13_output_buffer_offset = w13_thread_buffer_offset; + w13_thread_buffer_offset += w13_output_buffer_size; + + // Weighted sum thread buffer + const int32_t ws_output_buffer_size = + cpu_utils::round_up<64>(output_size_2 * sizeof(float)); + const int32_t ws_output_buffer_offset = ws_thread_buffer_offset; + ws_thread_buffer_offset += ws_output_buffer_size; + + const int32_t buffer_size = + common_buffer_offset + + std::max(w13_thread_buffer_offset, ws_thread_buffer_offset) * thread_num; + cpu_utils::ScratchPadManager::get_scratchpad_manager()->realloc(buffer_size); + uint8_t* common_buffer_start = + cpu_utils::ScratchPadManager::get_scratchpad_manager() + ->get_data(); + uint8_t* thread_buffer_start = common_buffer_start + common_buffer_offset; + + int32_t* __restrict__ token_num_per_group_buffer = reinterpret_cast( + common_buffer_start + token_num_per_group_buffer_offset); + int32_t* __restrict__ cu_token_num_per_group_buffer = + reinterpret_cast(common_buffer_start + + cu_token_num_per_group_buffer_offset); + int32_t* __restrict__ expand_token_id_buffer = reinterpret_cast( + common_buffer_start + expand_token_id_buffer_offset); + int32_t* __restrict__ expand_token_id_index_buffer = + reinterpret_cast(common_buffer_start + + expand_token_id_index_buffer_offset); + + // prepare token-expert mappings + { + std::memset(token_num_per_group_buffer, 0, expert_num * sizeof(int32_t)); + for (int32_t i = 0; i < token_num * topk_num; ++i) { + int32_t curr_expert_id = topk_id[i]; + ++token_num_per_group_buffer[curr_expert_id]; + } + + int32_t token_num_sum = 0; + cu_token_num_per_group_buffer[0] = 0; + int32_t* token_index_buffer = cu_token_num_per_group_buffer + 1; + for (int32_t i = 0; i < expert_num; ++i) { + token_index_buffer[i] = token_num_sum; + token_num_sum += token_num_per_group_buffer[i]; + } + + for (int32_t i = 0; i < token_num; ++i) { + int32_t* curr_topk_id = topk_id + i * topk_num; + int32_t* curr_index_buffer = expand_token_id_index_buffer + i * 
topk_num; + for (int32_t j = 0; j < topk_num; ++j) { + int32_t curr_expert_id = curr_topk_id[j]; + int32_t curr_index = token_index_buffer[curr_expert_id]; + ++token_index_buffer[curr_expert_id]; + expand_token_id_buffer[curr_index] = i; + curr_index_buffer[j] = curr_index; + } + } + } + + // w13 GEMM + act + { + alignas(64) cpu_utils::Counter counter; + cpu_utils::Counter* counter_ptr = &counter; + +#pragma omp parallel for schedule(static, 1) + for (int32_t thread_id = 0; thread_id < thread_num; ++thread_id) { + const int32_t task_num_per_expert = + (output_size_13 + w13_n_tile_size - 1) / w13_n_tile_size; + const int32_t task_num = task_num_per_expert * expert_num; + + uint8_t* __restrict__ thread_buffer = + thread_buffer_start + thread_id * w13_thread_buffer_offset; + scalar_t* __restrict__ w13_input_buffer = + reinterpret_cast(thread_buffer + w13_input_buffer_offset); + float* __restrict__ w13_output_buffer = + reinterpret_cast(thread_buffer + w13_output_buffer_offset); + scalar_t* __restrict__ w13_gemm_output_buffer = + reinterpret_cast(common_buffer_start + + w13_gemm_output_buffer_offset); + + gemm_t gemm; + + const int32_t input_size_13_bytes = input_size_13 * sizeof(scalar_t); + const int32_t w13_n_group_stride = 16 * input_size_13; + const int32_t w13_n_tile_stride = gemm_n_tile_size * input_size_13; + + for (;;) { + int32_t task_id = counter_ptr->acquire_counter(); + if (task_id >= task_num) { + break; + } + + const int32_t curr_expert_id = task_id / task_num_per_expert; + const int32_t curr_output_group_id = task_id % task_num_per_expert; + const int32_t curr_token_num = + token_num_per_group_buffer[curr_expert_id]; + if (curr_token_num == 0) { + continue; + } + + const int32_t actual_n_tile_size = + std::min(w13_n_tile_size, + output_size_13 - curr_output_group_id * w13_n_tile_size); + const int32_t* __restrict__ curr_expand_token_id_buffer = + expand_token_id_buffer + + cu_token_num_per_group_buffer[curr_expert_id]; + scalar_t* __restrict__ 
curr_w13_gemm_output_buffer = + w13_gemm_output_buffer + + cu_token_num_per_group_buffer[curr_expert_id] * + (output_size_13 / 2) + + curr_output_group_id * w13_n_tile_size / 2; + + w_t* __restrict__ w13_weight_ptr_0 = nullptr; + w_t* __restrict__ w13_weight_ptr_1 = nullptr; + w_t* __restrict__ w13_bias_ptr_0 = nullptr; + w_t* __restrict__ w13_bias_ptr_1 = nullptr; + if (act_type == FusedMOEAct::SwigluOAIAndMul) { + // For SwigluOAIAndMul, up and down weights are interleaved + w13_weight_ptr_0 = + w13 + curr_expert_id * input_size_13 * output_size_13 + + curr_output_group_id * w13_n_tile_size * input_size_13; + w13_weight_ptr_1 = + w13_weight_ptr_0 + actual_n_tile_size / 2 * input_size_13; + if (w13_bias != nullptr) { + w13_bias_ptr_0 = w13_bias + curr_expert_id * output_size_13 + + curr_output_group_id * w13_n_tile_size; + w13_bias_ptr_1 = w13_bias_ptr_0 + actual_n_tile_size / 2; + } + } else { + w13_weight_ptr_0 = + w13 + curr_expert_id * input_size_13 * output_size_13 + + curr_output_group_id * (w13_n_tile_size / 2) * input_size_13; + w13_weight_ptr_1 = + w13_weight_ptr_0 + output_size_13 / 2 * input_size_13; + if (w13_bias != nullptr) { + w13_bias_ptr_0 = w13_bias + curr_expert_id * output_size_13 + + curr_output_group_id * (w13_n_tile_size / 2); + w13_bias_ptr_1 = w13_bias_ptr_0 + output_size_13 / 2; + } + } + + scalar_t* __restrict__ curr_w13_input_buffer = w13_input_buffer; + for (int32_t token_idx = 0; token_idx < curr_token_num; + token_idx += gemm_m_tile_size) { + const int32_t actual_token_num = + std::min(gemm_m_tile_size, curr_token_num - token_idx); + // copy inputs + { + scalar_t* __restrict__ curr_w13_input_buffer_iter = + curr_w13_input_buffer; + for (int32_t i = 0; i < actual_token_num; ++i) { + const int32_t curr_token_id = curr_expand_token_id_buffer[i]; + int8_t* __restrict__ curr_input_iter = reinterpret_cast( + input + curr_token_id * input_size_13); + int8_t* __restrict__ curr_output_iter = + reinterpret_cast(curr_w13_input_buffer_iter); + 
int32_t j = 0; + for (; j < input_size_13_bytes - 64; j += 64) { + vec_op::INT8Vec64 vec(curr_input_iter); + vec.save(curr_output_iter); + curr_input_iter += 64; + curr_output_iter += 64; + } + vec_op::INT8Vec64 vec(curr_input_iter); + vec.save(curr_output_iter, input_size_13_bytes - j); + + // update + curr_w13_input_buffer_iter += input_size_13; + } + // update + curr_expand_token_id_buffer += actual_token_num; + } + + // gemm + act + { + scalar_t* __restrict__ w13_weight_ptr_0_iter = w13_weight_ptr_0; + scalar_t* __restrict__ w13_weight_ptr_1_iter = w13_weight_ptr_1; + scalar_t* __restrict__ w13_bias_ptr_0_iter = w13_bias_ptr_0; + scalar_t* __restrict__ w13_bias_ptr_1_iter = w13_bias_ptr_1; + scalar_t* __restrict__ curr_w13_input_buffer_iter = + curr_w13_input_buffer; + float* __restrict__ w13_output_buffer_0_iter = w13_output_buffer; + float* __restrict__ w13_output_buffer_1_iter = + w13_output_buffer + actual_n_tile_size / 2; + for (int32_t i = 0; i < actual_n_tile_size; + i += min_w13_n_tile_size) { + gemm.gemm(curr_w13_input_buffer_iter, w13_weight_ptr_0_iter, + w13_output_buffer_0_iter, actual_token_num, + input_size_13, input_size_13, w13_n_group_stride, + actual_n_tile_size, false); + + if (w13_bias != nullptr) { + cpu_micro_gemm::add_bias_epilogue( + w13_output_buffer_0_iter, w13_output_buffer_0_iter, + w13_bias_ptr_0_iter, actual_token_num, actual_n_tile_size, + actual_n_tile_size); + w13_bias_ptr_0_iter += gemm_n_tile_size; + } + + gemm.gemm(curr_w13_input_buffer_iter, w13_weight_ptr_1_iter, + w13_output_buffer_1_iter, actual_token_num, + input_size_13, input_size_13, w13_n_group_stride, + actual_n_tile_size, false); + + if (w13_bias != nullptr) { + cpu_micro_gemm::add_bias_epilogue( + w13_output_buffer_1_iter, w13_output_buffer_1_iter, + w13_bias_ptr_1_iter, actual_token_num, actual_n_tile_size, + actual_n_tile_size); + w13_bias_ptr_1_iter += gemm_n_tile_size; + } + + // update + w13_weight_ptr_0_iter += w13_n_tile_stride; + w13_weight_ptr_1_iter += 
w13_n_tile_stride; + w13_output_buffer_0_iter += gemm_n_tile_size; + w13_output_buffer_1_iter += gemm_n_tile_size; + } + + apply_gated_act(act_type, w13_output_buffer, + curr_w13_gemm_output_buffer, actual_token_num, + actual_n_tile_size, actual_n_tile_size, + output_size_13 / 2); + + // update + curr_w13_gemm_output_buffer += + gemm_m_tile_size * (output_size_13 / 2); + } + } + } + } + } + + // w2 GEMM + { + alignas(64) cpu_utils::Counter counter; + cpu_utils::Counter* counter_ptr = &counter; + +#pragma omp parallel for schedule(static, 1) + for (int32_t thread_id = 0; thread_id < thread_num; ++thread_id) { + const int32_t task_num_per_expert = + (output_size_2 + w2_n_tile_size - 1) / w2_n_tile_size; + const int32_t task_num = task_num_per_expert * expert_num; + scalar_t* __restrict__ w13_gemm_output_buffer = + reinterpret_cast(common_buffer_start + + w13_gemm_output_buffer_offset); + float* __restrict__ w2_gemm_output_buffer = reinterpret_cast( + common_buffer_start + w2_gemm_output_buffer_offset); + + gemm_t gemm; + + const int32_t w2_n_tile_stride = gemm_n_tile_size * input_size_2; + const int32_t w2_n_group_stride = 16 * input_size_2; + + for (;;) { + int32_t task_id = counter_ptr->acquire_counter(); + if (task_id >= task_num) { + break; + } + + const int32_t curr_expert_id = task_id / task_num_per_expert; + const int32_t curr_output_group_id = task_id % task_num_per_expert; + const int32_t curr_token_num = + token_num_per_group_buffer[curr_expert_id]; + if (curr_token_num == 0) { + continue; + } + + const int32_t actual_n_tile_size = + std::min(w2_n_tile_size, + output_size_2 - curr_output_group_id * w2_n_tile_size); + scalar_t* __restrict__ curr_w13_gemm_output_buffer = + w13_gemm_output_buffer + + cu_token_num_per_group_buffer[curr_expert_id] * input_size_2; + float* __restrict__ curr_w2_gemm_output_buffer = + w2_gemm_output_buffer + + cu_token_num_per_group_buffer[curr_expert_id] * output_size_2 + + curr_output_group_id * w2_n_tile_size; + scalar_t* 
__restrict__ w2_weight_ptr = + w2 + curr_expert_id * output_size_2 * input_size_2 + + curr_output_group_id * w2_n_tile_size * input_size_2; + scalar_t* __restrict__ w2_bias_ptr = nullptr; + if (w2_bias != nullptr) { + w2_bias_ptr = w2_bias + curr_expert_id * output_size_2 + + curr_output_group_id * w2_n_tile_size; + } + + for (int32_t token_idx = 0; token_idx < curr_token_num; + token_idx += gemm_m_tile_size) { + const int32_t actual_token_num = + std::min(gemm_m_tile_size, curr_token_num - token_idx); + + scalar_t* __restrict__ w2_weight_ptr_iter = w2_weight_ptr; + scalar_t* __restrict__ w2_bias_ptr_iter = w2_bias_ptr; + float* __restrict__ curr_w2_gemm_output_buffer_iter = + curr_w2_gemm_output_buffer; + for (int32_t i = 0; i < actual_n_tile_size; i += gemm_n_tile_size) { + gemm.gemm(curr_w13_gemm_output_buffer, w2_weight_ptr_iter, + curr_w2_gemm_output_buffer_iter, actual_token_num, + input_size_2, input_size_2, w2_n_group_stride, + output_size_2, false); + + if (w2_bias != nullptr) { + cpu_micro_gemm::add_bias_epilogue( + curr_w2_gemm_output_buffer_iter, + curr_w2_gemm_output_buffer_iter, w2_bias_ptr_iter, + actual_token_num, output_size_2, output_size_2); + w2_bias_ptr_iter += gemm_n_tile_size; + } + + w2_weight_ptr_iter += w2_n_tile_stride; + curr_w2_gemm_output_buffer_iter += gemm_n_tile_size; + } + + // update + curr_w13_gemm_output_buffer += gemm_m_tile_size * input_size_2; + curr_w2_gemm_output_buffer += gemm_m_tile_size * output_size_2; + } + } + } + } + + // weighted sum + { + alignas(64) cpu_utils::Counter counter; + cpu_utils::Counter* counter_ptr = &counter; + +#pragma omp parallel for schedule(static, 1) + for (int32_t thread_id = 0; thread_id < thread_num; ++thread_id) { + const int32_t task_num = token_num; + uint8_t* __restrict__ thread_buffer = + thread_buffer_start + thread_id * ws_thread_buffer_offset; + float* __restrict__ ws_output_buffer = + reinterpret_cast(thread_buffer + ws_output_buffer_offset); + float* __restrict__ 
w2_gemm_output_buffer = reinterpret_cast( + common_buffer_start + w2_gemm_output_buffer_offset); + + for (;;) { + int32_t task_id = counter_ptr->acquire_counter(); + if (task_id >= task_num) { + break; + } + + int32_t token_id = task_id; + int32_t* __restrict__ curr_expand_token_id_index_buffer = + expand_token_id_index_buffer + token_id * topk_num; + float* __restrict__ curr_weight = topk_weights + token_id * topk_num; + scalar_t* __restrict__ curr_output_buffer = + output + token_id * output_size_2; + + if (topk_num > 1) { + { + int32_t w2_output_idx = curr_expand_token_id_index_buffer[0]; + float* __restrict__ w2_output_iter = + w2_gemm_output_buffer + w2_output_idx * output_size_2; + float* __restrict__ ws_output_buffer_iter = ws_output_buffer; + vec_op::FP32Vec16 weight_vec(curr_weight[0]); + for (int32_t i = 0; i < output_size_2; i += 16) { + vec_op::FP32Vec16 vec(w2_output_iter); + vec = vec * weight_vec; + vec.save(ws_output_buffer_iter); + + // update + w2_output_iter += 16; + ws_output_buffer_iter += 16; + } + } + + { + for (int32_t idx = 1; idx < topk_num - 1; ++idx) { + int32_t w2_output_idx = curr_expand_token_id_index_buffer[idx]; + float* __restrict__ w2_output_iter = + w2_gemm_output_buffer + w2_output_idx * output_size_2; + float* __restrict__ ws_output_buffer_iter = ws_output_buffer; + vec_op::FP32Vec16 weight_vec(curr_weight[idx]); + for (int32_t i = 0; i < output_size_2; i += 16) { + vec_op::FP32Vec16 vec(w2_output_iter); + vec_op::FP32Vec16 sum(ws_output_buffer_iter); + sum = sum + vec * weight_vec; + sum.save(ws_output_buffer_iter); + + // update + w2_output_iter += 16; + ws_output_buffer_iter += 16; + } + } + } + + { + int32_t idx = topk_num - 1; + int32_t w2_output_idx = curr_expand_token_id_index_buffer[idx]; + float* __restrict__ w2_output_iter = + w2_gemm_output_buffer + w2_output_idx * output_size_2; + float* __restrict__ ws_output_buffer_iter = ws_output_buffer; + scalar_t* __restrict__ curr_output_buffer_iter = curr_output_buffer; + 
vec_op::FP32Vec16 weight_vec(curr_weight[idx]); + for (int32_t i = 0; i < output_size_2; i += 16) { + vec_op::FP32Vec16 vec(w2_output_iter); + vec_op::FP32Vec16 sum(ws_output_buffer_iter); + sum = sum + vec * weight_vec; + scalar_vec_t out_vec(sum); + out_vec.save(curr_output_buffer_iter); + + // update + w2_output_iter += 16; + ws_output_buffer_iter += 16; + curr_output_buffer_iter += 16; + } + } + } else { + int32_t w2_output_idx = curr_expand_token_id_index_buffer[0]; + float* __restrict__ w2_output_iter = + w2_gemm_output_buffer + w2_output_idx * output_size_2; + scalar_t* __restrict__ curr_output_buffer_iter = curr_output_buffer; + vec_op::FP32Vec16 weight_vec(curr_weight[0]); + for (int32_t i = 0; i < output_size_2; i += 16) { + vec_op::FP32Vec16 vec(w2_output_iter); + vec = vec * weight_vec; + scalar_vec_t out_vec(vec); + out_vec.save(curr_output_buffer_iter); + + // update + w2_output_iter += 16; + curr_output_buffer_iter += 16; + } + } + } + } + } +} +} // namespace + +void prepack_moe_weight( + const torch::Tensor& weight, // [expert_num, output_size, input_size] + torch::Tensor& packed_weight, const std::string& isa) { + TORCH_CHECK(weight.is_contiguous()); + const int32_t expert_num = weight.size(0); + const int32_t output_size = weight.size(1); + const int32_t input_size = weight.size(2); + TORCH_CHECK_EQ(output_size % 32, 0); + const int64_t expert_stride = weight.stride(0); + cpu_utils::ISA isa_type = cpu_utils::get_isa(isa); + + VLLM_DISPATCH_FLOATING_TYPES( + weight.scalar_type(), "prepack_moe_weight", [&]() { + CPU_ISA_DISPATCH_IMPL(isa_type, [&]() { + scalar_t* weight_ptr = weight.data_ptr(); + scalar_t* packed_weight_ptr = packed_weight.data_ptr(); + prepack_moe_weight_impl( + weight_ptr, packed_weight_ptr, expert_num, output_size, + input_size, expert_stride); + }); + }); +} + +void cpu_fused_moe( + torch::Tensor& output, // [token_num, output_size_2] + const torch::Tensor& input, // [token_num, input_size_13] + const torch::Tensor& + w13, // 
[expert_num, output_size_13, input_size_13], packed + const torch::Tensor& + w2, // [expert_num, output_size_2, input_size_2], packed + const std::optional& + w13_bias, // [expert_num, output_size_13] + const std::optional& w2_bias, // [expert_num, output_size_2] + const torch::Tensor& topk_weights, // [token_num, k], float32 + const torch::Tensor& topk_id, // [token_num, k], int32 + const std::string& act, const std::string& isa) { + const int32_t token_num = input.size(0); + const int32_t input_size_13 = input.size(1); + const int64_t input_stride = input.stride(0); + TORCH_CHECK_EQ(input_stride, input_size_13); + const int32_t expert_num = w13.size(0); + const int32_t output_size_13 = w13.size(1); + const int32_t input_size_2 = w2.size(2); + const int32_t output_size_2 = w2.size(1); + const int32_t topk_num = topk_id.size(1); + const FusedMOEAct act_type = get_act_type(act); + cpu_utils::ISA isa_type = cpu_utils::get_isa(isa); + + VLLM_DISPATCH_FLOATING_TYPES(w13.scalar_type(), "cpu_fused_moe", [&]() { + CPU_ISA_DISPATCH_IMPL(isa_type, [&]() { + fused_moe_impl( + output.data_ptr(), input.data_ptr(), + w13.data_ptr(), w2.data_ptr(), + w13_bias.has_value() ? w13_bias->data_ptr() : nullptr, + w2_bias.has_value() ? 
w2_bias->data_ptr() : nullptr, + topk_weights.data_ptr(), topk_id.data_ptr(), act_type, + token_num, expert_num, topk_num, input_size_13, output_size_13, + input_size_2, output_size_2); + }); + }); +} diff --git a/csrc/cpu/cpu_types_x86.hpp b/csrc/cpu/cpu_types_x86.hpp index 6f51277f78440..d94af338ac1c9 100644 --- a/csrc/cpu/cpu_types_x86.hpp +++ b/csrc/cpu/cpu_types_x86.hpp @@ -352,6 +352,10 @@ struct FP32Vec16 : public Vec { explicit FP32Vec16(bool, void* ptr) : reg((__m512)_mm512_stream_load_si512(ptr)) {} + // strided load + explicit FP32Vec16(const float* ptr, INT32Vec16 idx) + : reg(_mm512_i32gather_ps(idx.reg, ptr, 4)) {} + explicit FP32Vec16(__m512 data) : reg(data) {} // de-pack 4 bit values @@ -408,6 +412,10 @@ struct FP32Vec16 : public Vec { return FP32Vec16(_mm512_sub_ps(reg, b.reg)); } + FP32Vec16 operator-() const { + return FP32Vec16(_mm512_xor_ps(reg, _mm512_set1_ps(-0.0f))); + } + FP32Vec16 operator/(const FP32Vec16& b) const { return FP32Vec16(_mm512_div_ps(reg, b.reg)); } diff --git a/csrc/cpu/cpu_wna16.cpp b/csrc/cpu/cpu_wna16.cpp index 816d195506e52..88d48f3db8772 100644 --- a/csrc/cpu/cpu_wna16.cpp +++ b/csrc/cpu/cpu_wna16.cpp @@ -1,6 +1,5 @@ -#include "cpu_types.hpp" -#include "scratchpad_manager.h" -#include "utils.hpp" +#include "cpu/cpu_types.hpp" +#include "cpu/utils.hpp" #ifdef CPU_CAPABILITY_AMXBF16 #include "cpu/micro_gemm/cpu_micro_gemm_amx.hpp" @@ -158,7 +157,7 @@ void cpu_gemm_wna16_impl( // a simple schedule policy, just to hold more B tiles in L2 and make sure // each thread has tasks const int32_t n_partition_size = [&]() { - const int64_t cache_size = cpu_utils::get_l2_size(); + const int64_t cache_size = cpu_utils::get_available_l2_size(); int64_t ps_cache_limit = cache_size / (k_size * sizeof(scalar_t)); int64_t ps_thread_limit = n_size / thread_num; ps_cache_limit = @@ -179,8 +178,8 @@ void cpu_gemm_wna16_impl( const int64_t b_buffer_offset = 0; const int64_t c_buffer_offset = b_buffer_size; const int64_t buffer_size = 
b_buffer_size + c_buffer_size; - DNNLScratchPadManager::get_dnnl_scratchpad_manager()->realloc(buffer_size * - thread_num); + cpu_utils::ScratchPadManager::get_scratchpad_manager()->realloc(buffer_size * + thread_num); alignas(64) cpu_utils::Counter counter; cpu_utils::Counter* counter_ptr = &counter; @@ -190,9 +189,10 @@ void cpu_gemm_wna16_impl( scalar_t* __restrict__ b_buffer = nullptr; float* __restrict__ c_buffer = nullptr; { - uint8_t* buffer_ptr = DNNLScratchPadManager::get_dnnl_scratchpad_manager() - ->get_data() + - thread_id * buffer_size; + uint8_t* buffer_ptr = + cpu_utils::ScratchPadManager::get_scratchpad_manager() + ->get_data() + + thread_id * buffer_size; b_buffer = reinterpret_cast(buffer_ptr + b_buffer_offset); c_buffer = reinterpret_cast(buffer_ptr + c_buffer_offset); } diff --git a/csrc/cpu/dnnl_helper.cpp b/csrc/cpu/dnnl_helper.cpp index cfb6e78cba9a1..e337e10e1cf7b 100644 --- a/csrc/cpu/dnnl_helper.cpp +++ b/csrc/cpu/dnnl_helper.cpp @@ -4,8 +4,8 @@ #include "common/memory_desc.hpp" #include "common/memory.hpp" -#include "dnnl_helper.h" -#include "scratchpad_manager.h" +#include "cpu/utils.hpp" +#include "cpu/dnnl_helper.h" static dnnl::engine& default_engine() { static dnnl::engine engine(dnnl::engine::kind::cpu, 0); @@ -274,7 +274,7 @@ void W8A8MatMulPrimitiveHandler::execute(ExecArgs& args) { auto&& [scratchpad_storage, scratchpad_mem_desc] = get_runtime_memory_ptr(5); scratchpad_storage->set_data_handle( - DNNLScratchPadManager::get_dnnl_scratchpad_manager()->get_data()); + cpu_utils::ScratchPadManager::get_scratchpad_manager()->get_data()); matmul.execute(default_stream(), memory_cache_); default_stream().wait(); @@ -294,7 +294,7 @@ dnnl::matmul W8A8MatMulPrimitiveHandler::get_matmul_cache( return m_size_cache_->get_or_create(key, [&]() { dnnl::matmul::primitive_desc desc = this->create_primitive_desc(key, false); - auto manager = DNNLScratchPadManager::get_dnnl_scratchpad_manager(); + auto manager = 
cpu_utils::ScratchPadManager::get_scratchpad_manager(); manager->realloc(desc.scratchpad_desc().get_size()); return dnnl::matmul(desc); }); @@ -470,7 +470,7 @@ void MatMulPrimitiveHandler::execute(ExecArgs& args) { auto&& [scratchpad_storage, scratchpad_mem_desc] = get_runtime_memory_ptr(3); scratchpad_storage->set_data_handle( - DNNLScratchPadManager::get_dnnl_scratchpad_manager()->get_data()); + cpu_utils::ScratchPadManager::get_scratchpad_manager()->get_data()); matmul.execute(default_stream(), memory_cache_); default_stream().wait(); @@ -486,7 +486,7 @@ dnnl::matmul MatMulPrimitiveHandler::get_matmul_cache( } return m_size_cache_->get_or_create(key, [&]() { dnnl::matmul::primitive_desc desc = this->create_primitive_desc(key, false); - auto manager = DNNLScratchPadManager::get_dnnl_scratchpad_manager(); + auto manager = cpu_utils::ScratchPadManager::get_scratchpad_manager(); manager->realloc(desc.scratchpad_desc().get_size()); return dnnl::matmul(desc); }); diff --git a/csrc/cpu/micro_gemm/cpu_micro_gemm_amx.hpp b/csrc/cpu/micro_gemm/cpu_micro_gemm_amx.hpp index 87a019773a895..357c7cf1d7844 100644 --- a/csrc/cpu/micro_gemm/cpu_micro_gemm_amx.hpp +++ b/csrc/cpu/micro_gemm/cpu_micro_gemm_amx.hpp @@ -235,6 +235,39 @@ class MicroGemm { } } + static void pack_weight(const scalar_t* __restrict__ weight, + scalar_t* __restrict__ packed_weight, + const int32_t output_size, const int32_t input_size) { + constexpr int32_t elem_num_per_group = 4 / sizeof(scalar_t); + TORCH_CHECK_EQ(output_size % 16, 0); + TORCH_CHECK_EQ(input_size % (16 * elem_num_per_group), 0); + + const int32_t output_group_num = output_size / 16; + const int32_t input_32b_num = input_size / elem_num_per_group; + for (int32_t output_group_idx = 0; output_group_idx < output_group_num; + ++output_group_idx) { + const int32_t* __restrict__ weight_32b = + reinterpret_cast(weight); + int32_t* __restrict__ packed_weight_32b = + reinterpret_cast(packed_weight); + for (int32_t output_idx = 0; output_idx < 16; 
++output_idx) { + for (int32_t weight_offset = 0, packed_offset = 0; + weight_offset < input_32b_num; + ++weight_offset, packed_offset += 16) { + packed_weight_32b[packed_offset] = weight_32b[weight_offset]; + } + + // update + weight_32b += input_32b_num; + packed_weight_32b += 1; + } + + // update + weight += 16 * input_size; + packed_weight += 16 * input_size; + } + } + private: alignas(64) __tilecfg amx_tile_config_; int32_t curr_m_; diff --git a/csrc/cpu/micro_gemm/cpu_micro_gemm_impl.hpp b/csrc/cpu/micro_gemm/cpu_micro_gemm_impl.hpp index 784da55a420e5..23e78a681b5fe 100644 --- a/csrc/cpu/micro_gemm/cpu_micro_gemm_impl.hpp +++ b/csrc/cpu/micro_gemm/cpu_micro_gemm_impl.hpp @@ -13,6 +13,9 @@ namespace cpu_micro_gemm { #define CPU_MICRO_GEMM_PARAMS \ a_ptr, b_ptr, c_ptr, m, k, lda, b_n_group_stride, ldc, accum_c +// Note: weights for MicroGemm should be packed as (output_size / 16) contiguous +// blocks, means the logical shape of blocks is [16, input_size]. And the actual +// layout of blocks can be ISA-specific. 
template class MicroGemm { public: @@ -86,6 +89,41 @@ FORCE_INLINE void bias_epilogue(float* __restrict__ c_ptr, curr_d += ldd; } } + +template +FORCE_INLINE void add_bias_epilogue(float* c_ptr, float* d_ptr, + scalar_t* __restrict__ bias_ptr, + const int32_t m, const int64_t ldc, + const int64_t ldd) { + using scalar_vec_t = typename cpu_utils::VecTypeTrait::vec_t; + static_assert(n_size % 16 == 0); + constexpr int32_t n_group_num = n_size / 16; + static_assert(n_group_num <= 16); + + vec_op::FP32Vec16 bias_vecs[n_group_num]; + scalar_t* __restrict__ curr_bias = bias_ptr; + vec_op::unroll_loop([&](int32_t i) { + scalar_vec_t vec(curr_bias); + bias_vecs[i] = vec_op::FP32Vec16(vec); + curr_bias += 16; + }); + + float* curr_c = c_ptr; + float* curr_d = d_ptr; + for (int32_t i = 0; i < m; ++i) { + float* curr_c_iter = curr_c; + float* curr_d_iter = curr_d; + vec_op::unroll_loop([&](int32_t n_g_idx) { + vec_op::FP32Vec16 c_vec_fp32(curr_c_iter); + c_vec_fp32 = c_vec_fp32 + bias_vecs[n_g_idx]; + c_vec_fp32.save(curr_d_iter); + curr_c_iter += 16; + curr_d_iter += 16; + }); + curr_c += ldc; + curr_d += ldd; + } +} } // namespace cpu_micro_gemm #endif diff --git a/csrc/cpu/micro_gemm/cpu_micro_gemm_vec.hpp b/csrc/cpu/micro_gemm/cpu_micro_gemm_vec.hpp index 3985c2f2e5fe4..bdd3e85a1c522 100644 --- a/csrc/cpu/micro_gemm/cpu_micro_gemm_vec.hpp +++ b/csrc/cpu/micro_gemm/cpu_micro_gemm_vec.hpp @@ -109,6 +109,25 @@ class MicroGemm { void gemm(DEFINE_CPU_MICRO_GEMM_PARAMS) { TileGemm82::gemm(CPU_MICRO_GEMM_PARAMS); } + + // Note: pack contiguous weight [output_size, input_size] as contiguous + // packed weight [output_size / 16, input_size, 16] + static void pack_weight(const scalar_t* __restrict__ weight, + scalar_t* __restrict__ packed_weight, + const int32_t output_size, const int32_t input_size) { + TORCH_CHECK_EQ(output_size % 16, 0); + for (int32_t o_idx = 0; o_idx < output_size; ++o_idx) { + const scalar_t* __restrict__ curr_weight = weight + o_idx * input_size; + scalar_t* 
__restrict__ curr_packed_weight = + packed_weight + (o_idx / 16) * (16 * input_size) + o_idx % 16; + for (int32_t i_idx = 0; i_idx < input_size; ++i_idx) { + *curr_packed_weight = *curr_weight; + + curr_packed_weight += 16; + ++curr_weight; + } + } + } }; } // namespace cpu_micro_gemm diff --git a/csrc/cpu/scratchpad_manager.cpp b/csrc/cpu/scratchpad_manager.cpp deleted file mode 100644 index 05cd435f34b7a..0000000000000 --- a/csrc/cpu/scratchpad_manager.cpp +++ /dev/null @@ -1,23 +0,0 @@ -#include - -#include "scratchpad_manager.h" - -DNNLScratchPadManager::DNNLScratchPadManager() : size_(0), ptr_(nullptr) { - this->realloc(allocation_unit * 128); -} - -void DNNLScratchPadManager::realloc(size_t new_size) { - new_size = round(new_size); - if (new_size > size_) { - if (ptr_ != nullptr) { - std::free(ptr_); - } - ptr_ = std::aligned_alloc(64, new_size); - size_ = new_size; - } -} - -DNNLScratchPadManager* DNNLScratchPadManager::get_dnnl_scratchpad_manager() { - static DNNLScratchPadManager manager; - return &manager; -} diff --git a/csrc/cpu/scratchpad_manager.h b/csrc/cpu/scratchpad_manager.h deleted file mode 100644 index 0ecf59192f845..0000000000000 --- a/csrc/cpu/scratchpad_manager.h +++ /dev/null @@ -1,31 +0,0 @@ -#ifndef SCRATCHPAD_MANAGER_H -#define SCRATCHPAD_MANAGER_H - -#include -#include - -class DNNLScratchPadManager { - public: - static constexpr size_t allocation_unit = 4 * 1024; // 4KB - - static DNNLScratchPadManager* get_dnnl_scratchpad_manager(); - - DNNLScratchPadManager(); - - template - T* get_data() { - return reinterpret_cast(ptr_); - } - - static size_t round(size_t size) { - return ((size + allocation_unit - 1) / allocation_unit) * allocation_unit; - } - - void realloc(size_t new_size); - - private: - size_t size_; - void* ptr_; -}; - -#endif diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp index e0e3ef71b485f..dd419405c94b9 100644 --- a/csrc/cpu/torch_bindings.cpp +++ b/csrc/cpu/torch_bindings.cpp @@ -110,6 +110,17 @@ 
void cpu_gemm_wna16(const torch::Tensor& input, const torch::Tensor& q_weight, const std::optional& bias, const int64_t pack_factor, const std::string& isa_hint); +void prepack_moe_weight(const torch::Tensor& weight, + torch::Tensor& packed_weight, const std::string& isa); + +void cpu_fused_moe(torch::Tensor& output, const torch::Tensor& input, + const torch::Tensor& w13, const torch::Tensor& w2, + const std::optional& w13_bias, + const std::optional& w2_bias, + const torch::Tensor& topk_weights, + const torch::Tensor& topk_id, const std::string& act, + const std::string& isa); + TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // vLLM custom ops @@ -296,6 +307,19 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { "pack_factor, str isa_hint) -> ()"); ops.impl("cpu_gemm_wna16", torch::kCPU, &cpu_gemm_wna16); #endif + + // fused moe +#if defined(__AVX512F__) + ops.def( + "prepack_moe_weight(Tensor weight, Tensor(a1!) packed_weight, str isa) " + "-> ()"); + ops.impl("prepack_moe_weight", torch::kCPU, &prepack_moe_weight); + ops.def( + "cpu_fused_moe(Tensor(a0!) output, Tensor input, Tensor w13, Tensor w2, " + "Tensor? w13_bias, Tensor? 
w2_bias, Tensor topk_weights, Tensor topk_id, " + "str act, str isa) -> ()"); + ops.impl("cpu_fused_moe", torch::kCPU, &cpu_fused_moe); +#endif } TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _utils), utils) { diff --git a/csrc/cpu/utils.cpp b/csrc/cpu/utils.cpp index 3dacfc7b2b7a3..88bc3c509790c 100644 --- a/csrc/cpu/utils.cpp +++ b/csrc/cpu/utils.cpp @@ -10,7 +10,7 @@ #define gettid() syscall(SYS_gettid) #endif -#include "cpu_types.hpp" +#include "cpu/utils.hpp" #ifdef VLLM_NUMA_DISABLED std::string init_cpu_threads_env(const std::string& cpu_ids) { @@ -138,4 +138,26 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) { return ss.str(); } -#endif +#endif // VLLM_NUMA_DISABLED + +namespace cpu_utils { +ScratchPadManager::ScratchPadManager() : size_(0), ptr_(nullptr) { + this->realloc(allocation_unit * 128); +} + +void ScratchPadManager::realloc(size_t new_size) { + new_size = round(new_size); + if (new_size > size_) { + if (ptr_ != nullptr) { + std::free(ptr_); + } + ptr_ = std::aligned_alloc(64, new_size); + size_ = new_size; + } +} + +ScratchPadManager* ScratchPadManager::get_scratchpad_manager() { + static ScratchPadManager manager; + return &manager; +} +} // namespace cpu_utils diff --git a/csrc/cpu/utils.hpp b/csrc/cpu/utils.hpp index d3def306b8069..682751d67b1cd 100644 --- a/csrc/cpu/utils.hpp +++ b/csrc/cpu/utils.hpp @@ -2,19 +2,24 @@ #define UTILS_HPP #include -#include -#include #include +#include -#if defined(__APPLE__) - #include -#endif - -#include "cpu_types.hpp" +#include "cpu/cpu_types.hpp" namespace cpu_utils { enum class ISA { AMX, VEC }; +inline ISA get_isa(const std::string& isa) { + if (isa == "amx") { + return ISA::AMX; + } else if (isa == "vec") { + return ISA::VEC; + } else { + TORCH_CHECK(false, "Invalid isa type: " + isa); + } +} + template struct VecTypeTrait { using vec_t = void; @@ -32,10 +37,12 @@ struct VecTypeTrait { }; #endif +#if !defined(__powerpc__) template <> struct VecTypeTrait { using vec_t = 
vec_op::FP16Vec16; }; +#endif struct Counter { std::atomic counter; @@ -48,26 +55,66 @@ struct Counter { int64_t acquire_counter() { return counter++; } }; -inline int64_t get_l2_size() { +inline int64_t get_available_l2_size() { static int64_t size = []() { -#if defined(__APPLE__) - // macOS doesn't have _SC_LEVEL2_CACHE_SIZE. Use sysctlbyname. - int64_t l2_cache_size = 0; - size_t len = sizeof(l2_cache_size); - if (sysctlbyname("hw.l2cachesize", &l2_cache_size, &len, NULL, 0) == 0 && - l2_cache_size > 0) { - return l2_cache_size >> 1; // use 50% of L2 cache - } - // Fallback if sysctlbyname fails - return 128LL * 1024 >> 1; // use 50% of 128KB -#else - long l2_cache_size = sysconf(_SC_LEVEL2_CACHE_SIZE); - assert(l2_cache_size != -1); + const uint32_t l2_cache_size = at::cpu::L2_cache_size(); return l2_cache_size >> 1; // use 50% of L2 cache -#endif }(); return size; } + +template +inline T round_up(T size) { + T alignment = alignment_v; + return (((size + alignment - 1) / alignment) * alignment); +} + +template +inline T round_down(T size) { + T alignment = alignment_v; + return (size / alignment) * alignment; +} + +template +inline void print_logits(const char* name, T* ptr, int32_t row, int32_t col, + int32_t stride) { + std::stringstream ss; + ss << std::fixed << std::setprecision(5) << name << ": [\n"; + auto* curr_logits_buffer = ptr; + for (int32_t m = 0; m < row; ++m) { + for (int32_t n = 0; n < col; ++n) { + ss << curr_logits_buffer[n] << ", "; + } + ss << "\n"; + curr_logits_buffer += stride; + } + ss << "]\n"; + std::printf("%s", ss.str().c_str()); +} + +class ScratchPadManager { + public: + static constexpr size_t allocation_unit = 4 * 1024; // 4KB + + static ScratchPadManager* get_scratchpad_manager(); + + ScratchPadManager(); + + template + T* get_data() { + return reinterpret_cast(ptr_); + } + + static size_t round(size_t size) { + return ((size + allocation_unit - 1) / allocation_unit) * allocation_unit; + } + + void realloc(size_t new_size); + + 
private: + size_t size_; + void* ptr_; +}; } // namespace cpu_utils #endif diff --git a/csrc/fused_qknorm_rope_kernel.cu b/csrc/fused_qknorm_rope_kernel.cu index baff8363162ef..a51e1a347e1d4 100644 --- a/csrc/fused_qknorm_rope_kernel.cu +++ b/csrc/fused_qknorm_rope_kernel.cu @@ -107,7 +107,8 @@ __global__ void fusedQKNormRopeKernel( void const* k_weight_void, // RMSNorm weights for key void const* cos_sin_cache_void, // Pre-computed cos/sin cache int64_t const* position_ids, // Position IDs for RoPE - int const num_tokens // Number of tokens + int const num_tokens, // Number of tokens + int const rotary_dim // Dimension for RoPE ) { #if (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 800) && !defined(USE_ROCM) if constexpr ((std::is_same_v) || @@ -227,56 +228,59 @@ __global__ void fusedQKNormRopeKernel( // Calculate cache pointer for this position - similar to // pos_encoding_kernels.cu - T_cache const* cache_ptr = cos_sin_cache + pos_id * head_dim; - int const embed_dim = head_dim / 2; + T_cache const* cache_ptr = cos_sin_cache + pos_id * rotary_dim; + int const embed_dim = rotary_dim / 2; T_cache const* cos_ptr = cache_ptr; T_cache const* sin_ptr = cache_ptr + embed_dim; - - if constexpr (interleave) { - // Perform interleaving. Use pre-computed cos/sin values. + int const rotary_lanes = rotary_dim / numElemsPerThread; // rotary range + if (laneId < rotary_lanes) { + if constexpr (interleave) { + // Perform interleaving. Use pre-computed cos/sin values. 
#pragma unroll - for (int i = 0; i < numElemsPerThread / 2; ++i) { - int const idx0 = 2 * i; - int const idx1 = 2 * i + 1; + for (int i = 0; i < numElemsPerThread / 2; ++i) { + int const idx0 = 2 * i; + int const idx1 = 2 * i + 1; + // Global dimension index in the head + int const dim_idx = laneId * numElemsPerThread + idx0; - float const val0 = elements[idx0]; - float const val1 = elements[idx1]; + float const val0 = elements[idx0]; + float const val1 = elements[idx1]; - int const dim_idx = laneId * numElemsPerThread + idx0; - int const half_dim = dim_idx / 2; - float const cos_val = - CacheConverter::convert(VLLM_LDG(cos_ptr + half_dim)); - float const sin_val = - CacheConverter::convert(VLLM_LDG(sin_ptr + half_dim)); + int const half_dim = dim_idx / 2; + float const cos_val = + CacheConverter::convert(VLLM_LDG(cos_ptr + half_dim)); + float const sin_val = + CacheConverter::convert(VLLM_LDG(sin_ptr + half_dim)); - elements[idx0] = val0 * cos_val - val1 * sin_val; - elements[idx1] = val0 * sin_val + val1 * cos_val; - } - } else { - // Before data exchange with in warp, we need to sync. - __syncwarp(); - // Get the data from the other half of the warp. Use pre-computed cos/sin - // values. -#pragma unroll - for (int i = 0; i < numElemsPerThread; i++) { - elements2[i] = __shfl_xor_sync(FINAL_MASK, elements[i], 16); - if (laneId < 16) { - elements2[i] = -elements2[i]; + elements[idx0] = val0 * cos_val - val1 * sin_val; + elements[idx1] = val0 * sin_val + val1 * cos_val; } + } else { + // Before data exchange with in warp, we need to sync. + __syncwarp(); + int pairOffset = (rotary_dim / 2) / numElemsPerThread; + // Get the data from the other half of the warp. Use pre-computed + // cos/sin values. 
+#pragma unroll + for (int i = 0; i < numElemsPerThread; i++) { + elements2[i] = __shfl_xor_sync(FINAL_MASK, elements[i], pairOffset); - int dim_idx = laneId * numElemsPerThread + i; - dim_idx = (dim_idx * 2) % head_dim; - int half_dim = dim_idx / 2; - // Use pre-computed cos/sin from cache - float cos_val = CacheConverter::convert(VLLM_LDG(cos_ptr + half_dim)); - float sin_val = CacheConverter::convert(VLLM_LDG(sin_ptr + half_dim)); + if (laneId < pairOffset) { + elements2[i] = -elements2[i]; + } + int dim_idx = laneId * numElemsPerThread + i; - elements[i] = elements[i] * cos_val + elements2[i] * sin_val; + dim_idx = (dim_idx * 2) % rotary_dim; + int half_dim = dim_idx / 2; + float cos_val = CacheConverter::convert(VLLM_LDG(cos_ptr + half_dim)); + float sin_val = CacheConverter::convert(VLLM_LDG(sin_ptr + half_dim)); + + elements[i] = elements[i] * cos_val + elements2[i] * sin_val; + } + // __shfl_xor_sync does not provide memfence. Need to sync again. + __syncwarp(); } - // __shfl_xor_sync does not provide memfence. Need to sync again. - __syncwarp(); } - // Store. 
{ vec_T vec; @@ -312,10 +316,10 @@ template void launchFusedQKNormRope(void* qkv, int const num_tokens, int const num_heads_q, int const num_heads_k, int const num_heads_v, int const head_dim, - float const eps, void const* q_weight, - void const* k_weight, void const* cos_sin_cache, - bool const interleave, int64_t const* position_ids, - cudaStream_t stream) { + int const rotary_dim, float const eps, + void const* q_weight, void const* k_weight, + void const* cos_sin_cache, bool const interleave, + int64_t const* position_ids, cudaStream_t stream) { constexpr int blockSize = 256; int const warpsPerBlock = blockSize / 32; @@ -332,7 +336,7 @@ void launchFusedQKNormRope(void* qkv, int const num_tokens, fusedQKNormRopeKernel <<>>( qkv, num_heads_q, num_heads_k, num_heads_v, eps, q_weight, - k_weight, cos_sin_cache, position_ids, num_tokens); + k_weight, cos_sin_cache, position_ids, num_tokens, rotary_dim); }); break; case 128: @@ -340,7 +344,7 @@ void launchFusedQKNormRope(void* qkv, int const num_tokens, fusedQKNormRopeKernel <<>>( qkv, num_heads_q, num_heads_k, num_heads_v, eps, q_weight, - k_weight, cos_sin_cache, position_ids, num_tokens); + k_weight, cos_sin_cache, position_ids, num_tokens, rotary_dim); }); break; case 256: @@ -348,7 +352,7 @@ void launchFusedQKNormRope(void* qkv, int const num_tokens, fusedQKNormRopeKernel <<>>( qkv, num_heads_q, num_heads_k, num_heads_v, eps, q_weight, - k_weight, cos_sin_cache, position_ids, num_tokens); + k_weight, cos_sin_cache, position_ids, num_tokens, rotary_dim); }); break; default: @@ -392,8 +396,11 @@ void fused_qk_norm_rope( "Query weights size must match head dimension"); TORCH_CHECK(k_weight.size(0) == head_dim, "Key weights size must match head dimension"); - TORCH_CHECK(cos_sin_cache.size(1) == head_dim, - "Cos/sin cache dimension must match head_dim"); + + TORCH_CHECK(cos_sin_cache.size(1) % 2 == 0, "rotary_dim must be even"); + TORCH_CHECK(cos_sin_cache.size(1) <= head_dim, + "rotary_dim must be less than or 
equal to head_dim"); + TORCH_CHECK(qkv.scalar_type() == q_weight.scalar_type() && qkv.scalar_type() == k_weight.scalar_type(), "qkv, q_weight and k_weight must have the same dtype"); @@ -419,7 +426,8 @@ void fused_qk_norm_rope( qkv.data_ptr(), static_cast(num_tokens), static_cast(num_heads_q), static_cast(num_heads_k), static_cast(num_heads_v), static_cast(head_dim), - static_cast(eps), q_weight.data_ptr(), k_weight.data_ptr(), + static_cast(cos_sin_cache.size(1)), static_cast(eps), + q_weight.data_ptr(), k_weight.data_ptr(), cos_sin_cache.data_ptr(), !is_neox, reinterpret_cast(position_ids.data_ptr()), stream); diff --git a/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu b/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu index 7539f836ecf37..e0438556dfe5c 100644 --- a/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu +++ b/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu @@ -74,6 +74,9 @@ __global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024)) static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD, "Vec size is not matched."); + // Precompute SF layout parameter (constant for entire kernel). + int32_t const numKTiles = (numCols + 63) / 64; + // Get the global scaling factor, which will be applied to the SF. // Note SFScale is the same as next GEMM's alpha, which is // (448.f / (Alpha_A / 6.f)). 
@@ -101,7 +104,7 @@ __global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024)) auto sf_out = cvt_quant_to_fp4_get_sf_out_offset( - rowIdx, colIdx, numCols, SFout); + rowIdx, colIdx, numKTiles, SFout); out_pos = cvt_warp_fp16_to_fp4(out_silu_mul, SFScaleVal, sf_out); diff --git a/csrc/quantization/fp4/nvfp4_experts_quant.cu b/csrc/quantization/fp4/nvfp4_experts_quant.cu index 82c53c2375a31..20191a9bc6160 100644 --- a/csrc/quantization/fp4/nvfp4_experts_quant.cu +++ b/csrc/quantization/fp4/nvfp4_experts_quant.cu @@ -25,6 +25,7 @@ #include #include "dispatch_utils.h" +#include "cuda_utils.h" #include "nvfp4_utils.cuh" #include "launch_bounds_utils.h" @@ -44,6 +45,9 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512)) static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD, "Vec size is not matched."); + // Precompute SF layout parameter (constant for entire kernel). + int32_t const numKTiles = (numCols + 63) / 64; + int tid = blockIdx.x * blockDim.x + threadIdx.x; int colsPerRow = numCols / CVT_FP4_ELTS_PER_THREAD; @@ -112,17 +116,13 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512)) // (448.f / (Alpha_A / 6.f)). float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[expert_idx]; - int factor = CVT_FP4_SF_VEC_SIZE * 4; - // The actual output_scales dim is computed from the padded numCols. 
- int32_t numCols_padded = (numCols + factor - 1) / factor * factor; - int numCols_SFout = numCols_padded / CVT_FP4_SF_VEC_SIZE / 4; uint32_t* SFout_in_expert = - SFout + output_scale_offset_by_experts[expert_idx] * numCols_SFout; + SFout + output_scale_offset_by_experts[expert_idx] * numKTiles; auto sf_out = cvt_quant_to_fp4_get_sf_out_offset( - rowIdx_in_expert, colIdx, numCols, SFout_in_expert); + rowIdx_in_expert, colIdx, numKTiles, SFout_in_expert); out_pos = cvt_warp_fp16_to_fp4(in_vec, SFScaleVal, sf_out); } @@ -140,6 +140,10 @@ __global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024)) (CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD); static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD, "Vec size is not matched."); + + // Precompute SF layout parameter (constant for entire kernel). + int32_t const numKTiles = (numCols + 63) / 64; + extern __shared__ uint32_t shared_input_offsets[]; // Load input offsets into shared memory. @@ -202,16 +206,13 @@ __global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024)) float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[expert_idx]; - int factor = CVT_FP4_SF_VEC_SIZE * 4; - int32_t numCols_padded = (numCols + factor - 1) / factor * factor; - int numCols_SFout = numCols_padded / CVT_FP4_SF_VEC_SIZE / 4; uint32_t* SFout_in_expert = - SFout + output_scale_offset_by_experts[expert_idx] * numCols_SFout; + SFout + output_scale_offset_by_experts[expert_idx] * numKTiles; auto sf_out = cvt_quant_to_fp4_get_sf_out_offset( - rowIdx_in_expert, colIdx, numCols, SFout_in_expert); + rowIdx_in_expert, colIdx, numKTiles, SFout_in_expert); out_pos = cvt_warp_fp16_to_fp4(in_vec, SFScaleVal, sf_out); } @@ -222,12 +223,8 @@ void quant_impl(void* output, void* output_scale, void* input, void* input_global_scale, void* input_offset_by_experts, void* output_scale_offset_by_experts, int m_topk, int k, int n_experts, cudaStream_t stream) { - // TODO: this multiProcessorCount should be cached. 
- int device; - cudaGetDevice(&device); - int multiProcessorCount; - cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount, - device); + int multiProcessorCount = + get_device_attribute(cudaDevAttrMultiProcessorCount, -1); // Grid, Block size. // Each thread converts 8 values. diff --git a/csrc/quantization/fp4/nvfp4_quant_kernels.cu b/csrc/quantization/fp4/nvfp4_quant_kernels.cu index 6d69852bb4e4f..8e38deeb6607f 100644 --- a/csrc/quantization/fp4/nvfp4_quant_kernels.cu +++ b/csrc/quantization/fp4/nvfp4_quant_kernels.cu @@ -35,7 +35,13 @@ template __host__ __device__ inline Int round_up(Int x, Int y) { static_assert(std::is_integral_v, "round_up argument must be integral type"); - return (x + y - 1) / y * y; + return ((x + y - 1) / y) * y; +} + +// Compute effective rows for grid configuration with swizzled SF layouts. +inline int computeEffectiveRows(int m) { + constexpr int ROW_TILE = 128; + return round_up(m, ROW_TILE); } // Use UE4M3 by default. @@ -49,81 +55,57 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512)) static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD, "Vec size is not matched."); + // Precompute SF layout parameter (constant for entire kernel). + int32_t const numKTiles = (numCols + 63) / 64; + int sf_m = round_up(numRows, 128); int sf_n_unpadded = numCols / CVT_FP4_SF_VEC_SIZE; int sf_n_int = round_up(sf_n_unpadded, 4) / 4; - for (int row = numRows + blockIdx.x; row < sf_m; row += gridDim.x) { - // Each thread writes 4 uint32_t elements. - for (int col = sf_n_unpadded + threadIdx.x * 4; col < sf_n_int; - col += blockDim.x * 4) { - SFout[row * sf_n_int + col] = 0x00; - } - } + int num_padded_cols = sf_n_int * 4 * CVT_FP4_SF_VEC_SIZE; // Get the global scaling factor, which will be applied to the SF. // Note SFScale is the same as next GEMM's alpha, which is // (448.f / (Alpha_A / 6.f)). float const global_scale = SFScale == nullptr ? 1.0f : SFScale[0]; - // Input tensor row/col loops. 
- for (int rowIdx = blockIdx.x; rowIdx < numRows; rowIdx += gridDim.x) { - for (int colIdx = threadIdx.x; colIdx < numCols / CVT_FP4_ELTS_PER_THREAD; + // Iterate over all rows and cols including padded ones - + // ensures we visit every single scale factor address to initialize it. + for (int rowIdx = blockIdx.x; rowIdx < sf_m; rowIdx += gridDim.x) { + for (int colIdx = threadIdx.x; + colIdx < num_padded_cols / CVT_FP4_ELTS_PER_THREAD; colIdx += blockDim.x) { + int elem_idx = colIdx * CVT_FP4_ELTS_PER_THREAD; + + PackedVec in_vec; int64_t inOffset = rowIdx * (numCols / CVT_FP4_ELTS_PER_THREAD) + colIdx; - PackedVec in_vec = reinterpret_cast(in)[inOffset]; - // Get the output tensor offset. - // Same as inOffset because 8 elements are packed into one uint32_t. - int64_t outOffset = inOffset; - auto& out_pos = out[outOffset]; + + // If we are outside valid rows OR outside valid columns -> Use Zeros + if (rowIdx >= numRows || elem_idx >= numCols) { + memset(&in_vec, 0, sizeof(PackedVec)); + + } else { + // Valid Region: Load actual data + in_vec = reinterpret_cast(in)[inOffset]; + } auto sf_out = cvt_quant_to_fp4_get_sf_out_offset( - rowIdx, colIdx, numCols, SFout); + rowIdx, colIdx, numKTiles, SFout); - out_pos = + auto out_val = cvt_warp_fp16_to_fp4(in_vec, global_scale, sf_out); + + // We do NOT write output for padding because the 'out' tensor is not + // padded. + if (rowIdx < numRows && elem_idx < numCols) { + // Same as inOffset because 8 elements are packed into one uint32_t. + out[inOffset] = out_val; + } } } } -template -void invokeFP4Quantization(int m, int n, T const* input, float const* SFScale, - int64_t* output, int32_t* SFOuput, bool useUE8M0, - int multiProcessorCount, cudaStream_t stream) { - // Grid, Block size. - // Each thread converts 8 values. 
- dim3 block(std::min(int(n / ELTS_PER_THREAD), 512)); - // Get number of blocks per SM - int const numBlocksPerSM = - vllm_runtime_blocks_per_sm(static_cast(block.x)); - dim3 grid(std::min(int(m), multiProcessorCount * numBlocksPerSM)); - - // Launch the cvt kernel. - if (useUE8M0) { - cvt_fp16_to_fp4<<>>( - m, n, input, SFScale, reinterpret_cast(output), - reinterpret_cast(SFOuput)); - } else { - cvt_fp16_to_fp4<<>>( - m, n, input, SFScale, reinterpret_cast(output), - reinterpret_cast(SFOuput)); - } -} - -// Instantiate the function. -template void invokeFP4Quantization(int m, int n, half const* input, - float const* SFScale, int64_t* output, - int32_t* SFOuput, bool useUE8M0, - int multiProcessorCount, - cudaStream_t stream); - -template void invokeFP4Quantization(int m, int n, __nv_bfloat16 const* input, - float const* SFScale, int64_t* output, - int32_t* SFOuput, bool useUE8M0, - int multiProcessorCount, - cudaStream_t stream); - } // namespace vllm void scaled_fp4_quant_sm1xxa(torch::Tensor const& output, @@ -147,13 +129,19 @@ void scaled_fp4_quant_sm1xxa(torch::Tensor const& output, const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); auto stream = at::cuda::getCurrentCUDAStream(input.get_device()); - // We don't support e8m0 scales at this moment. - bool useUE8M0 = false; + // Grid, Block size. Each thread converts 8 values. + dim3 block(std::min(int(n / ELTS_PER_THREAD), 512)); + int const numBlocksPerSM = + vllm_runtime_blocks_per_sm(static_cast(block.x)); + int effectiveRows = vllm::computeEffectiveRows(m); + dim3 grid(std::min(effectiveRows, multiProcessorCount * numBlocksPerSM)); VLLM_DISPATCH_HALF_TYPES(input.scalar_type(), "nvfp4_quant_kernel", [&] { using cuda_type = vllm::CUDATypeConverter::Type; auto input_ptr = static_cast(input.data_ptr()); - vllm::invokeFP4Quantization(m, n, input_ptr, input_sf_ptr, output_ptr, - sf_out, useUE8M0, multiProcessorCount, stream); + // NOTE: We don't support e8m0 scales at this moment. 
+ vllm::cvt_fp16_to_fp4<<>>( + m, n, input_ptr, input_sf_ptr, reinterpret_cast(output_ptr), + reinterpret_cast(sf_out)); }); -} +} \ No newline at end of file diff --git a/csrc/quantization/fp4/nvfp4_utils.cuh b/csrc/quantization/fp4/nvfp4_utils.cuh index 48e4959de9793..4c91af85e1514 100644 --- a/csrc/quantization/fp4/nvfp4_utils.cuh +++ b/csrc/quantization/fp4/nvfp4_utils.cuh @@ -128,51 +128,42 @@ inline __device__ float reciprocal_approximate_ftz(float a) { return b; } +// Compute SF output offset for swizzled tensor core layout. +// SF layout: [numMTiles, numKTiles, 32, 4, 4] +// Caller must precompute: numKTiles = (numCols + 63) / 64 template -__device__ uint8_t* cvt_quant_to_fp4_get_sf_out_offset(int rowIdx, int colIdx, - int numCols, - SFType* SFout) { +__device__ __forceinline__ uint8_t* cvt_quant_to_fp4_get_sf_out_offset( + int rowIdx, int colIdx, int32_t numKTiles, SFType* SFout) { static_assert(CVT_FP4_NUM_THREADS_PER_SF == 1 || CVT_FP4_NUM_THREADS_PER_SF == 2); // One pair of threads write one SF to global memory. // TODO: stage through smem for packed STG.32 // is it better than STG.8 from 4 threads ? - if (threadIdx.x % CVT_FP4_NUM_THREADS_PER_SF == 0) { - // SF vector index (16 elements share one SF in the K dimension). - int32_t kIdx = colIdx / CVT_FP4_NUM_THREADS_PER_SF; - int32_t mIdx = rowIdx; - - // SF layout [numMTiles, numKTiles, 32 (mTile), 4 (mTile), 4(kTile)] - // --> index [mTileIdx, kTileIdx, outerMIdx, innerMIdx, innerKIdx] - - int32_t mTileIdx = mIdx / (32 * 4); - // SF vector size 16. - int factor = CVT_FP4_SF_VEC_SIZE * 4; - int32_t numKTiles = (numCols + factor - 1) / factor; - int64_t mTileStride = numKTiles * 32 * 4 * 4; - - int32_t kTileIdx = (kIdx / 4); - int64_t kTileStride = 32 * 4 * 4; - - // M tile layout [32, 4] is column-major. 
- int32_t outerMIdx = (mIdx % 32); - int64_t outerMStride = 4 * 4; - - int32_t innerMIdx = (mIdx % (32 * 4)) / 32; - int64_t innerMStride = 4; - - int32_t innerKIdx = (kIdx % 4); - int64_t innerKStride = 1; - - // Compute the global offset. - int64_t SFOffset = mTileIdx * mTileStride + kTileIdx * kTileStride + - outerMIdx * outerMStride + innerMIdx * innerMStride + - innerKIdx * innerKStride; - - return reinterpret_cast(SFout) + SFOffset; + if (threadIdx.x % CVT_FP4_NUM_THREADS_PER_SF != 0) { + return nullptr; } - return nullptr; + + // SF vector index (16 elements share one SF in the K dimension). + int32_t kIdx = colIdx / CVT_FP4_NUM_THREADS_PER_SF; + int32_t mIdx = rowIdx; + + // Decompose indices using bitwise ops (all divisors are powers of 2). + // SF layout [numMTiles, numKTiles, 32 (mTile), 4 (mTile), 4(kTile)] + int32_t mTileIdx = mIdx >> 7; // mIdx / 128 + int32_t outerMIdx = mIdx & 31; // mIdx % 32 + int32_t innerMIdx = (mIdx >> 5) & 3; // (mIdx / 32) % 4 + int32_t kTileIdx = kIdx >> 2; // kIdx / 4 + int32_t innerKIdx = kIdx & 3; // kIdx % 4 + + // Compute global SF offset: mTileIdx * (numKTiles * 512) + kTileIdx * 512 + + // outerMIdx * 16 + innerMIdx * 4 + innerKIdx + // Use bitwise OR for non-overlapping lower bits. 
+ int64_t SFOffset = (static_cast(mTileIdx) * numKTiles + kTileIdx) + << 9 | + (outerMIdx << 4) | (innerMIdx << 2) | innerKIdx; + + return reinterpret_cast(SFout) + SFOffset; } // Quantizes the provided PackedVec into the uint32_t output diff --git a/csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu b/csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu deleted file mode 100644 index 6c8f6309ef43f..0000000000000 --- a/csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu +++ /dev/null @@ -1,373 +0,0 @@ -#include "core/registration.h" - -#include -#include - -#include -#include -#include - -#include "cute/tensor.hpp" -#include "cutlass/tensor_ref.h" -#include "cutlass/epilogue/collective/default_epilogue.hpp" -#include "cutlass/epilogue/thread/linear_combination.h" -#include "cutlass/gemm/dispatch_policy.hpp" -#include "cutlass/gemm/group_array_problem_shape.hpp" -#include "cutlass/gemm/collective/collective_builder.hpp" -#include "cutlass/epilogue/collective/collective_builder.hpp" -#include "cutlass/gemm/device/gemm_universal_adapter.h" -#include "cutlass/gemm/kernel/gemm_universal.hpp" - -#include "cutlass/util/command_line.h" -#include "cutlass/util/distribution.h" -#include "cutlass/util/host_tensor.h" -#include "cutlass/util/packed_stride.hpp" -#include "cutlass/util/tensor_view_io.h" -#include "cutlass/util/reference/device/gemm.h" -#include "cutlass/util/reference/device/tensor_compare.h" -#include "cutlass/util/reference/host/tensor_fill.h" -#include "cutlass/util/reference/host/gett.hpp" -#include "cutlass/util/reference/host/tensor_norm.h" -#include "cutlass/util/reference/host/tensor_compare.h" -#include - -using namespace cute; - -template -__global__ void get_ggemm_starts( - int32_t* expert_offsets, ElementAB** a_offsets, ElementAB** b_offsets, - ElementC** out_offsets, ElementAccumulator** a_scale_offsets, - ElementAccumulator** b_scale_offsets, ElementAB* a_base_as_int, - ElementAB* b_base_as_int, 
ElementC* out_base_as_int, - ElementAccumulator* a_scale_base_as_int, - ElementAccumulator* b_scale_base_as_int, LayoutSFA* layout_sfa_base_as_int, - LayoutSFB* layout_sfb_base_as_int, int* problem_sizes) { - int expert_id = threadIdx.x; - - if (expert_id >= gridDim.x * blockDim.x) { - return; - } - - int m = problem_sizes[expert_id * 3]; - int n = problem_sizes[expert_id * 3 + 1]; - int k = problem_sizes[expert_id * 3 + 2]; - - int32_t expert_offset = expert_offsets[expert_id]; - int a_stride = expert_offset * k; - int b_stride = expert_id * k * n; - int a_scale_stride = expert_offset * k / 128; - int b_scale_stride = expert_id * k * n / 128 / 128; - - a_offsets[expert_id] = a_base_as_int + a_stride; - b_offsets[expert_id] = b_base_as_int + b_stride; - out_offsets[expert_id] = out_base_as_int + expert_offset * n; - a_scale_offsets[expert_id] = a_scale_base_as_int + a_scale_stride; - b_scale_offsets[expert_id] = b_scale_base_as_int + b_scale_stride; - - LayoutSFA* layout_sfa_ptr = layout_sfa_base_as_int + expert_id; - LayoutSFB* layout_sfb_ptr = layout_sfb_base_as_int + expert_id; - - *layout_sfa_ptr = - ScaleConfig::tile_atom_to_shape_SFA(cute::make_shape(m, n, k, 1)); - *layout_sfb_ptr = - ScaleConfig::tile_atom_to_shape_SFB(cute::make_shape(m, n, k, 1)); -} - -#define __CALL_GET_STARTS_KERNEL(TENSOR_C_TYPE, C_TYPE, LayoutSFA, LayoutSFB, \ - ScaleConfig) \ - else if (out_tensors.dtype() == TENSOR_C_TYPE) { \ - get_ggemm_starts<<<1, num_experts, 0, stream>>>( \ - static_cast(expert_offsets.data_ptr()), \ - static_cast(a_ptrs.data_ptr()), \ - static_cast(b_ptrs.data_ptr()), \ - static_cast(out_ptrs.data_ptr()), \ - static_cast(a_scales_ptrs.data_ptr()), \ - static_cast(b_scales_ptrs.data_ptr()), \ - static_cast(a_tensors.data_ptr()), \ - static_cast(b_tensors.data_ptr()), \ - static_cast(out_tensors.data_ptr()), \ - static_cast(a_scales.data_ptr()), \ - static_cast(b_scales.data_ptr()), \ - reinterpret_cast(layout_sfa.data_ptr()), \ - 
reinterpret_cast(layout_sfb.data_ptr()), \ - static_cast(problem_sizes.data_ptr())); \ - } - -template -void run_get_ggemm_starts( - torch::Tensor const& expert_offsets, torch::Tensor& a_ptrs, - torch::Tensor& b_ptrs, torch::Tensor& out_ptrs, - torch::Tensor& a_scales_ptrs, torch::Tensor& b_scales_ptrs, - torch::Tensor const& a_tensors, torch::Tensor const& b_tensors, - torch::Tensor out_tensors, torch::Tensor const& a_scales, - torch::Tensor const& b_scales, torch::Tensor const& layout_sfa, - torch::Tensor const& layout_sfb, torch::Tensor const& problem_sizes) { - TORCH_CHECK(a_tensors.dtype() == torch::kFloat8_e4m3fn); - TORCH_CHECK(b_tensors.dtype() == torch::kFloat8_e4m3fn); - TORCH_CHECK(a_scales.dtype() == torch::kFloat32); - TORCH_CHECK(b_scales.dtype() == torch::kFloat32); - TORCH_CHECK(out_tensors.size(1) % 128 == 0 or out_tensors.size(0) % 128 == 0); - TORCH_CHECK(a_tensors.size(1) % 128 == 0 or a_tensors.size(0) % 128 == 0); - - int num_experts = (int)expert_offsets.size(0); - auto stream = at::cuda::getCurrentCUDAStream(a_tensors.device().index()); - - if (false) { - } - __CALL_GET_STARTS_KERNEL(torch::kBFloat16, cutlass::bfloat16_t, LayoutSFA, - LayoutSFB, ScaleConfig) - __CALL_GET_STARTS_KERNEL(torch::kFloat16, cutlass::half_t, LayoutSFA, - LayoutSFB, ScaleConfig) - else { - TORCH_CHECK(false, "Unsupported output tensor type"); - } -} - -template -void run_blockwise_scaled_group_mm( - torch::Tensor& out_ptrs, const torch::Tensor& a_ptrs, - const torch::Tensor& b_ptrs, const torch::Tensor& a_scales_ptrs, - const torch::Tensor& b_scales_ptrs, const torch::Tensor& stride_a, - const torch::Tensor& stride_b, const torch::Tensor& stride_c, - const torch::Tensor& layout_sfa, const torch::Tensor& layout_sfb, - const torch::Tensor& problem_sizes, const torch::Tensor& expert_offsets) { - using ProblemShape = cutlass::gemm::GroupProblemShape>; - - // Types - using ElementA = cutlass::float_e4m3_t; - using ElementB = cutlass::float_e4m3_t; - using ElementC = 
OutType; - using ElementD = ElementC; - using ElementAccumulator = float; - using LayoutA = cutlass::layout::RowMajor; - using LayoutB = cutlass::layout::ColumnMajor; - using LayoutC = LayoutD; - - // Alignments - static constexpr int AlignmentA = 128 / cutlass::sizeof_bits::value; - static constexpr int AlignmentB = 128 / cutlass::sizeof_bits::value; - static constexpr int AlignmentC = 128 / cutlass::sizeof_bits::value; - - using ArchTag = cutlass::arch::Sm100; - using OperatorClass = cutlass::arch::OpClassTensorOp; - - using CollectiveEpilogue = - typename cutlass::epilogue::collective::CollectiveBuilder< - ArchTag, OperatorClass, typename ScheduleConfig::MmaTileShape, - typename ScheduleConfig::ClusterShape, - cutlass::epilogue::collective::EpilogueTileAuto, ElementAccumulator, - ElementAccumulator, void, LayoutC*, AlignmentC, ElementD, LayoutC*, - AlignmentC, typename ScheduleConfig::EpilogueSchedule>::CollectiveOp; - - using CollectiveMainloop = - typename cutlass::gemm::collective::CollectiveBuilder< - ArchTag, OperatorClass, ElementA, - cute::tuple, - AlignmentA, ElementB, - cute::tuple, - AlignmentB, ElementAccumulator, typename ScheduleConfig::MmaTileShape, - typename ScheduleConfig::ClusterShape, - cutlass::gemm::collective::StageCountAutoCarveout( - sizeof(typename CollectiveEpilogue::SharedStorage))>, - typename ScheduleConfig::KernelSchedule>::CollectiveOp; - - using GemmKernel = - cutlass::gemm::kernel::GemmUniversal; - - using Gemm = cutlass::gemm::device::GemmUniversalAdapter; - using StrideA = typename Gemm::GemmKernel::InternalStrideA; - using StrideB = typename Gemm::GemmKernel::InternalStrideB; - using StrideC = typename Gemm::GemmKernel::InternalStrideC; - using StrideD = typename Gemm::GemmKernel::InternalStrideD; - - using UnderlyingProblemShape = ProblemShape::UnderlyingProblemShape; - int num_experts = (int)expert_offsets.size(0); - - Gemm gemm_op; - - // Mainloop Arguments - typename GemmKernel::MainloopArguments mainloop_args{ - 
static_cast(a_ptrs.data_ptr()), - static_cast(stride_a.data_ptr()), - static_cast(b_ptrs.data_ptr()), - static_cast(stride_b.data_ptr()), - static_cast(a_scales_ptrs.data_ptr()), - reinterpret_cast( - layout_sfa.data_ptr()), - static_cast(b_scales_ptrs.data_ptr()), - reinterpret_cast( - layout_sfb.data_ptr())}; - - int device_id = a_ptrs.device().index(); - static const cutlass::KernelHardwareInfo hw_info{ - device_id, cutlass::KernelHardwareInfo::query_device_multiprocessor_count( - device_id)}; - - // Epilogue Arguments - typename GemmKernel::EpilogueArguments epilogue_args{ - {}, // epilogue.thread - nullptr, - static_cast(stride_c.data_ptr()), - static_cast(out_ptrs.data_ptr()), - static_cast(stride_c.data_ptr())}; - - UnderlyingProblemShape* problem_sizes_as_shapes = - static_cast(problem_sizes.data_ptr()); - - // Gemm Arguments - typename GemmKernel::Arguments args{ - cutlass::gemm::GemmUniversalMode::kGrouped, - {num_experts, problem_sizes_as_shapes, nullptr}, - mainloop_args, - epilogue_args, - hw_info}; - - at::cuda::CUDAGuard device_guard{(char)a_ptrs.device().index()}; - const cudaStream_t stream = - at::cuda::getCurrentCUDAStream(a_ptrs.get_device()); - - auto can_implement_status = gemm_op.can_implement(args); - TORCH_CHECK(can_implement_status == cutlass::Status::kSuccess, - "Failed to implement GEMM"); - - size_t workspace_size = gemm_op.get_workspace_size(args); - auto const workspace_options = - torch::TensorOptions().dtype(torch::kUInt8).device(a_ptrs.device()); - auto workspace = torch::empty(workspace_size, workspace_options); - - auto status = gemm_op.initialize(args, workspace.data_ptr(), stream); - TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to initialize GEMM"); - - status = gemm_op.run(stream); - TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to run GEMM"); -} - -template -void blockwise_scaled_group_mm_dispatch_shape( - torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b, - const torch::Tensor& 
scales_a, const torch::Tensor& scales_b, - const torch::Tensor& problem_sizes, const torch::Tensor& expert_offsets) { - struct MmaConfig { - using ElementA = cutlass::float_e4m3_t; - using KernelSchedule = - cutlass::gemm::KernelPtrArrayTmaWarpSpecializedBlockwise1SmSm100; - using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; - using ScaleConfig = cutlass::detail::Sm100BlockwiseScaleConfig< - 1, 128, 128, cute::UMMA::Major::K, cute::UMMA::Major::K>; - using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA()); - using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB()); - using LayoutC = cutlass::layout::RowMajor; - using MmaTileShape = Shape<_128, _128, _128>; - using ClusterShape = Shape<_1, _1, _1>; - }; - - int num_experts = (int)expert_offsets.size(0); - - auto a_ptrs = torch::empty( - {num_experts}, - torch::TensorOptions().dtype(torch::kInt64).device(a.device())); - auto b_ptrs = torch::empty( - {num_experts}, - torch::TensorOptions().dtype(torch::kInt64).device(a.device())); - auto out_ptrs = torch::empty( - {num_experts}, - torch::TensorOptions().dtype(torch::kInt64).device(a.device())); - auto a_scales_ptrs = torch::empty( - {num_experts}, - torch::TensorOptions().dtype(torch::kInt64).device(a.device())); - auto b_scales_ptrs = torch::empty( - {num_experts}, - torch::TensorOptions().dtype(torch::kInt64).device(a.device())); - - auto layout_sfa = torch::empty( - {num_experts, 5}, - torch::TensorOptions().dtype(torch::kInt32).device(a.device())); - auto layout_sfb = torch::empty( - {num_experts, 5}, - torch::TensorOptions().dtype(torch::kInt32).device(a.device())); - - auto stride_a = torch::full( - {num_experts}, a.size(1), - torch::TensorOptions().dtype(torch::kInt64).device(a.device())); - auto stride_b = torch::full( - {num_experts}, a.size(1), - torch::TensorOptions().dtype(torch::kInt64).device(a.device())); - auto stride_c = torch::full( - {num_experts}, output.size(1), - 
torch::TensorOptions().dtype(torch::kInt64).device(a.device())); - - torch::TensorOptions options_int = - torch::TensorOptions().dtype(torch::kInt64).device(a.device()); - - run_get_ggemm_starts( - expert_offsets, a_ptrs, b_ptrs, out_ptrs, a_scales_ptrs, b_scales_ptrs, a, - b, output, scales_a, scales_b, layout_sfa, layout_sfb, problem_sizes); - - run_blockwise_scaled_group_mm( - out_ptrs, a_ptrs, b_ptrs, a_scales_ptrs, b_scales_ptrs, stride_a, - stride_b, stride_c, layout_sfa, layout_sfb, problem_sizes, - expert_offsets); -} - -void cutlass_blockwise_scaled_grouped_mm( - torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b, - const torch::Tensor& scales_a, const torch::Tensor& scales_b, - const torch::Tensor& problem_sizes, const torch::Tensor& expert_offsets) { - TORCH_CHECK(problem_sizes.dim() == 2, "problem_sizes must be 2D tensor"); - TORCH_CHECK(problem_sizes.size(1) == 3, - "problem_sizes must have shape (num_experts, 3)"); - TORCH_CHECK(problem_sizes.size(0) == expert_offsets.size(0), - "Number of experts in problem_sizes must match expert_offsets"); - TORCH_CHECK(problem_sizes.dtype() == torch::kInt32, - "problem_sizes must be int32"); - TORCH_CHECK(a.scalar_type() == torch::kFloat8_e4m3fn, - "a must be kFloat8_e4m3fn"); - TORCH_CHECK(b.scalar_type() == torch::kFloat8_e4m3fn, - "b must be kFloat8_e4m3fn"); - TORCH_CHECK(output.scalar_type() == torch::kBFloat16 || - output.scalar_type() == torch::kHalf, - "output must be bfloat16 or half"); - TORCH_CHECK(scales_a.scalar_type() == torch::kFloat32, - "scales_a must be float32"); - TORCH_CHECK(scales_b.scalar_type() == torch::kFloat32, - "scales_b must be float32"); - TORCH_CHECK(expert_offsets.scalar_type() == torch::kInt32, - "expert_offsets must be int32"); - - TORCH_CHECK(output.dim() == 2, "output must be 2D tensor"); - TORCH_CHECK(a.dim() == 2, "a must be 2D tensor"); - TORCH_CHECK(b.dim() == 3, "b must be 3D tensor"); - TORCH_CHECK(scales_a.dim() == 2, "scales_a must be 2D tensor"); - 
TORCH_CHECK(scales_b.dim() == 3, "scales_b must be 3D tensor"); - TORCH_CHECK(problem_sizes.dim() == 2, "problem_sizes must be 2D tensor"); - TORCH_CHECK(problem_sizes.size(1) == 3, - "problem_sizes must have shape (num_experts, 3)"); - TORCH_CHECK(problem_sizes.size(0) == expert_offsets.size(0), - "Number of experts in problem_sizes must match expert_offsets"); - TORCH_CHECK(problem_sizes.dtype() == torch::kInt32, - "problem_sizes must be int32"); - TORCH_CHECK(expert_offsets.dim() == 1, "expert_offsets must be 1D tensor"); - -#if defined(ENABLE_CUTLASS_MOE_SM100) && ENABLE_CUTLASS_MOE_SM100 - if (output.scalar_type() == torch::kBFloat16) { - blockwise_scaled_group_mm_dispatch_shape( - output, a, b, scales_a, scales_b, problem_sizes, expert_offsets); - } else if (output.scalar_type() == torch::kFloat16) { - blockwise_scaled_group_mm_dispatch_shape( - output, a, b, scales_a, scales_b, problem_sizes, expert_offsets); - } else { - TORCH_CHECK(false, "Unsupported output tensor type"); - } -#endif -} - -TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) { - m.impl("cutlass_blockwise_scaled_grouped_mm", - &cutlass_blockwise_scaled_grouped_mm); -} diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 83d4943d62776..461f74ca184fd 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -416,13 +416,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { " Tensor alpha) -> ()"); ops.impl("cutlass_scaled_fp4_mm", torch::kCUDA, &cutlass_scaled_fp4_mm); - // cutlass blockwise scaledgroup GEMM - ops.def( - "cutlass_blockwise_scaled_grouped_mm(Tensor! output, Tensor a, Tensor b, " - "Tensor scales_a, Tensor scales_b, " - "Tensor problem_sizes, Tensor expert_offsets) -> ()"); - // conditionally compiled so impl registration is in source file - // cutlass nvfp4 block scaled group GEMM ops.def( "cutlass_fp4_group_mm(Tensor! 
out, Tensor a, Tensor b," diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu index 8d55ecfba3e52..2caf1ad144178 100644 --- a/docker/Dockerfile.cpu +++ b/docker/Dockerfile.cpu @@ -17,7 +17,7 @@ # VLLM_CPU_DISABLE_AVX512=false (default)|true # VLLM_CPU_AVX512BF16=false (default)|true # VLLM_CPU_AVX512VNNI=false (default)|true -# VLLM_CPU_AMXBF16=false (default)|true +# VLLM_CPU_AMXBF16=false |true (default) # ######################### COMMON BASE IMAGE ######################### @@ -95,7 +95,7 @@ ENV VLLM_CPU_AVX512BF16=${VLLM_CPU_AVX512BF16} ARG VLLM_CPU_AVX512VNNI=0 ENV VLLM_CPU_AVX512VNNI=${VLLM_CPU_AVX512VNNI} # Support for building with AMXBF16 ISA: docker build --build-arg VLLM_CPU_AMXBF16="true" ... -ARG VLLM_CPU_AMXBF16=0 +ARG VLLM_CPU_AMXBF16=1 ENV VLLM_CPU_AMXBF16=${VLLM_CPU_AMXBF16} WORKDIR /workspace/vllm @@ -147,7 +147,9 @@ WORKDIR /workspace/vllm RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ --mount=type=cache,target=/var/lib/apt,sharing=locked \ - apt-get install -y --no-install-recommends vim numactl xz-utils + apt-get install -y --no-install-recommends vim numactl xz-utils make clangd-14 + +RUN ln -s /usr/bin/clangd-14 /usr/bin/clangd # install development dependencies (for testing) RUN --mount=type=cache,target=/root/.cache/uv \ diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index 1b6bdabc7a539..4c09808a14333 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -130,6 +130,7 @@ RUN --mount=type=bind,from=export_vllm,src=/,target=/install \ && uv pip install --system *.whl ARG COMMON_WORKDIR +ARG BASE_IMAGE # Copy over the benchmark scripts as well COPY --from=export_vllm /benchmarks ${COMMON_WORKDIR}/vllm/benchmarks @@ -144,4 +145,9 @@ ENV SAFETENSORS_FAST_GPU=1 # Performance environment variable. 
ENV HIP_FORCE_DEV_KERNARG=1 +# Workaround for ROCm profiler limits +RUN echo "ROCTRACER_MAX_EVENTS=10000000" > ${COMMON_WORKDIR}/libkineto.conf +ENV KINETO_CONFIG="${COMMON_WORKDIR}/libkineto.conf" +RUN echo "VLLM_BASE_IMAGE=${BASE_IMAGE}" >> ${COMMON_WORKDIR}/versions.txt + CMD ["/bin/bash"] diff --git a/docker/Dockerfile.rocm_base b/docker/Dockerfile.rocm_base index a57ee728d9243..c5e94ee1f6928 100644 --- a/docker/Dockerfile.rocm_base +++ b/docker/Dockerfile.rocm_base @@ -1,15 +1,15 @@ -ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.1-complete +ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.0-complete ARG TRITON_BRANCH="57c693b6" ARG TRITON_REPO="https://github.com/ROCm/triton.git" -ARG PYTORCH_BRANCH="1c57644d" -ARG PYTORCH_VISION_BRANCH="v0.23.0" +ARG PYTORCH_BRANCH="89075173" ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git" +ARG PYTORCH_VISION_BRANCH="v0.24.1" ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git" ARG PYTORCH_AUDIO_BRANCH="v2.9.0" ARG PYTORCH_AUDIO_REPO="https://github.com/pytorch/audio.git" ARG FA_BRANCH="0e60e394" ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git" -ARG AITER_BRANCH="59bd8ff2" +ARG AITER_BRANCH="6af8b687" ARG AITER_REPO="https://github.com/ROCm/aiter.git" FROM ${BASE_IMAGE} AS base @@ -162,4 +162,4 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \ && echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \ && echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \ && echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \ - && echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt \ No newline at end of file + && echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt diff --git a/docker/Dockerfile.xpu b/docker/Dockerfile.xpu index 72d2053102c22..4168c1570d874 100644 --- a/docker/Dockerfile.xpu +++ b/docker/Dockerfile.xpu @@ -2,7 +2,7 @@ FROM intel/deep-learning-essentials:2025.2.2-0-devel-ubuntu24.04 AS vllm-base RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg 
--dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && \ echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list && \ - add-apt-repository -y ppa:kobuk-team/intel-graphics + add-apt-repository -y ppa:kobuk-team/intel-graphics-staging RUN apt clean && apt-get update -y && \ apt-get install -y --no-install-recommends --fix-missing \ @@ -47,6 +47,11 @@ RUN --mount=type=cache,target=/root/.cache/pip \ pip install --no-cache-dir \ -r requirements/xpu.txt +# arctic-inference is built from source which needs torch-xpu properly installed +# used for suffix method speculative decoding +RUN --mount=type=cache,target=/root/.cache/pip \ + pip install --no-cache-dir arctic-inference==0.1.1 + ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/" COPY . . diff --git a/docs/benchmarking/dashboard.md b/docs/benchmarking/dashboard.md index 7cc4d23250df9..4cbc1a6a0a4fb 100644 --- a/docs/benchmarking/dashboard.md +++ b/docs/benchmarking/dashboard.md @@ -8,12 +8,19 @@ The results are automatically published to the public [vLLM Performance Dashboar ## Manually Trigger the benchmark Use [vllm-ci-test-repo images](https://gallery.ecr.aws/q9t5s3a7/vllm-ci-test-repo) with vLLM benchmark suite. -For CPU environment, please use the image with "-cpu" postfix. +For x86 CPU environment, please use the image with "-cpu" postfix. For AArch64 CPU environment, please use the image with "-arm64-cpu" postfix. -Here is an example for docker run command for CPU. +Here is an example for docker run command for CPU. For GPUs skip setting the `ON_CPU` env var. 
```bash -docker run -it --entrypoint /bin/bash -v /data/huggingface:/root/.cache/huggingface -e HF_TOKEN='' --shm-size=16g --name vllm-cpu-ci public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:1da94e673c257373280026f75ceb4effac80e892-cpu +export VLLM_COMMIT=1da94e673c257373280026f75ceb4effac80e892 # use full commit hash from the main branch +export HF_TOKEN= +if [[ "$(uname -m)" == aarch64 || "$(uname -m)" == arm64 ]]; then + IMG_SUFFIX="arm64-cpu" +else + IMG_SUFFIX="cpu" +fi +docker run -it --entrypoint /bin/bash -v /data/huggingface:/root/.cache/huggingface -e HF_TOKEN=$HF_TOKEN -e ON_ARM64_CPU=1 --shm-size=16g --name vllm-cpu-ci public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:${VLLM_COMMIT}-${IMG_SUFFIX} ``` Then, run below command inside the docker instance. @@ -26,7 +33,7 @@ When run, benchmark script generates results under **benchmark/results** folder, ### Runtime environment variables -- `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0. +- `ON_CPU`: set the value to '1' on Intel® Xeon® and Arm® Neoverse™ Processors. Default value is 0. - `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file). - `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file). - `THROUGHPUT_JSON`: JSON file to use for the throughout tests. Default value is empty string (use default file). 
diff --git a/docs/contributing/ci/update_pytorch_version.md b/docs/contributing/ci/update_pytorch_version.md index 735bb2e205332..74c0beb779c7d 100644 --- a/docs/contributing/ci/update_pytorch_version.md +++ b/docs/contributing/ci/update_pytorch_version.md @@ -77,25 +77,20 @@ This complicates the process as we cannot use the out-of-the-box - `.buildkite/release-pipeline.yaml` - `.buildkite/scripts/upload-wheels.sh` -## Address long vLLM build time +## Manually running vLLM builds on Buildkite CI -When building vLLM with a new PyTorch/CUDA version, no cache will exist -in the vLLM sccache S3 bucket, causing the build job on CI to potentially take more than 5 hours -and timeout. Additionally, since vLLM's fastcheck pipeline runs in read-only mode, -it doesn't populate the cache, so re-running it to warm up the cache -is ineffective. +When building vLLM with a new PyTorch/CUDA version, the vLLM sccache S3 bucket +will not have any cached artifacts, which can cause CI build jobs to exceed 5 hours. +Furthermore, vLLM's fastcheck pipeline operates in read-only mode and does not +populate the cache, making it ineffective for cache warm-up purposes. -While ongoing efforts like -address the long build time at its source, the current workaround is to set `VLLM_CI_BRANCH` -to a custom branch provided by @khluu (`VLLM_CI_BRANCH=khluu/long_build`) -when manually triggering a build on Buildkite. This branch accomplishes two things: +To address this, manually trigger a build on Buildkite to accomplish two objectives: -1. Increase the timeout limit to 10 hours so that the build doesn't time out. -2. Allow the compiled artifacts to be written to the vLLM sccache S3 bucket -to warm it up so that future builds are faster. +1. Run the complete test suite against the PyTorch RC build by setting the environment variables: `RUN_ALL=1` and `NIGHTLY=1` +2. Populate the vLLM sccache S3 bucket with compiled artifacts, enabling faster subsequent builds

- Buildkite new build popup +Buildkite new build popup

## Update all the different vLLM platforms diff --git a/docs/deployment/integrations/kserve.md b/docs/deployment/integrations/kserve.md index 37b29aa1a4876..06ad5f29a1a65 100644 --- a/docs/deployment/integrations/kserve.md +++ b/docs/deployment/integrations/kserve.md @@ -2,4 +2,4 @@ vLLM can be deployed with [KServe](https://github.com/kserve/kserve) on Kubernetes for highly scalable distributed model serving. -Please see [this guide](https://kserve.github.io/website/docs/model-serving/generative-inference/overview) for more details on using vLLM with KServe. +You can use vLLM with KServe's [Hugging Face serving runtime](https://kserve.github.io/website/docs/model-serving/generative-inference/overview) or via [`LLMInferenceService` that uses llm-d](https://kserve.github.io/website/docs/model-serving/generative-inference/llmisvc/llmisvc-overview). diff --git a/docs/deployment/integrations/llm-d.md b/docs/deployment/integrations/llm-d.md new file mode 100644 index 0000000000000..cccf1773c6be6 --- /dev/null +++ b/docs/deployment/integrations/llm-d.md @@ -0,0 +1,5 @@ +# llm-d + +vLLM can be deployed with [llm-d](https://github.com/llm-d/llm-d), a Kubernetes-native distributed inference serving stack providing well-lit paths for anyone to serve large generative AI models at scale. It helps achieve the fastest "time to state-of-the-art (SOTA) performance" for key OSS models across most hardware accelerators and infrastructure providers. + +You can use vLLM with llm-d directly by following [this guide](https://llm-d.ai/docs/guide) or via [KServe's LLMInferenceService](https://kserve.github.io/website/docs/model-serving/generative-inference/llmisvc/llmisvc-overview). 
diff --git a/docs/deployment/k8s.md b/docs/deployment/k8s.md index 05814cbad9bfc..77a159009aa8d 100644 --- a/docs/deployment/k8s.md +++ b/docs/deployment/k8s.md @@ -12,6 +12,7 @@ Alternatively, you can deploy vLLM to Kubernetes using any of the following: - [Helm](frameworks/helm.md) - [InftyAI/llmaz](integrations/llmaz.md) +- [llm-d](integrations/llm-d.md) - [KAITO](integrations/kaito.md) - [KServe](integrations/kserve.md) - [Kthena](integrations/kthena.md) diff --git a/docs/design/moe_kernel_features.md b/docs/design/moe_kernel_features.md index 48341d199cb80..6c02dcb76bec2 100644 --- a/docs/design/moe_kernel_features.md +++ b/docs/design/moe_kernel_features.md @@ -16,7 +16,7 @@ Async backends support the use of DBO (Dual Batch Overlap) and shared expert ove Certain models require the topk weights to be applied to the input activations rather than the output activations when topk==1, e.g. Llama. For modular kernels, this feature is supported by the `FusedMoEPrepareAndFinalize` subclass. For non-modular kernels, it is up to the experts function to deal with this flag. -Unless otherwise specified, backends are controlled via `VLLM_ALL2ALL_BACKEND`. All backends except `flashinfer` only work with EP+DP or EP+TP. `Flashinfer` can work with EP or DP without EP. +Unless otherwise specified, backends are controlled via the `--all2all-backend` command-line argument (or the `all2all_backend` parameter in `ParallelConfig`). All backends except `flashinfer` only work with EP+DP or EP+TP. `Flashinfer` can work with EP or DP without EP.