diff --git a/.buildkite/nightly-benchmarks/README.md b/.buildkite/nightly-benchmarks/README.md index ae42f70077cec..3721d3d1d6749 100644 --- a/.buildkite/nightly-benchmarks/README.md +++ b/.buildkite/nightly-benchmarks/README.md @@ -28,6 +28,7 @@ See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performanc ## Trigger the benchmark Performance benchmark will be triggered when: + - A PR being merged into vllm. - Every commit for those PRs with `perf-benchmarks` label AND `ready` label. @@ -38,6 +39,7 @@ bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh ``` Runtime environment variables: + - `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0. - `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file). - `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file). @@ -46,12 +48,14 @@ Runtime environment variables: - `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string. Nightly benchmark will be triggered when: + - Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label. ## Performance benchmark details See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases. > NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead. 
+> ### Latency test Here is an example of one test inside `latency-tests.json`: @@ -100,7 +104,6 @@ We test the throughput by using `vllm bench serve` with request rate = inf to co "tensor_parallel_size": 1, "swap_space": 16, "disable_log_stats": "", - "disable_log_requests": "", "load_format": "dummy" }, "client_parameters": { @@ -149,6 +152,7 @@ Here is an example using the script to compare result_a and result_b without det Here is an example using the script to compare result_a and result_b with detail test name. `python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json` + | | results_a/benchmark_results.json_name | results_a/benchmark_results.json | results_b/benchmark_results.json_name | results_b/benchmark_results.json | perf_ratio | |---|---------------------------------------------|----------------------------------------|---------------------------------------------|----------------------------------------|----------| | 0 | serving_llama8B_tp1_sharegpt_qps_1 | 142.633982 | serving_llama8B_tp1_sharegpt_qps_1 | 156.526018 | 1.097396 | diff --git a/.buildkite/nightly-benchmarks/nightly-annotation.md b/.buildkite/nightly-benchmarks/nightly-annotation.md index ef11c040057c8..466def07b6f1f 100644 --- a/.buildkite/nightly-benchmarks/nightly-annotation.md +++ b/.buildkite/nightly-benchmarks/nightly-annotation.md @@ -1,3 +1,4 @@ +# Nightly benchmark annotation ## Description @@ -13,15 +14,15 @@ Please download the visualization scripts in the post - Find the docker we use in `benchmarking pipeline` - Deploy the docker, and inside the docker: - - Download `nightly-benchmarks.zip`. - - In the same folder, run the following code: + - Download `nightly-benchmarks.zip`. 
+ - In the same folder, run the following code: - ```bash - export HF_TOKEN= - apt update - apt install -y git - unzip nightly-benchmarks.zip - VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh - ``` + ```bash + export HF_TOKEN= + apt update + apt install -y git + unzip nightly-benchmarks.zip + VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh + ``` And the results will be inside `./benchmarks/results`. diff --git a/.buildkite/nightly-benchmarks/nightly-descriptions.md b/.buildkite/nightly-benchmarks/nightly-descriptions.md index 5f003f42f07c0..8afde017d383e 100644 --- a/.buildkite/nightly-benchmarks/nightly-descriptions.md +++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md @@ -13,25 +13,25 @@ Latest reproduction guilde: [github issue link](https://github.com/vllm-project/ ## Setup - Docker images: - - vLLM: `vllm/vllm-openai:v0.6.2` - - SGLang: `lmsysorg/sglang:v0.3.2-cu121` - - LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12` - - TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3` - - *NOTE: we uses r24.07 as the current implementation only works for this version. We are going to bump this up.* - - Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark. + - vLLM: `vllm/vllm-openai:v0.6.2` + - SGLang: `lmsysorg/sglang:v0.3.2-cu121` + - LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12` + - TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3` + - *NOTE: we use r24.07 as the current implementation only works for this version. We are going to bump this up.* + - Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark.
- Hardware - - 8x Nvidia A100 GPUs + - 8x Nvidia A100 GPUs - Workload: - - Dataset - - ShareGPT dataset - - Prefill-heavy dataset (in average 462 input tokens, 16 tokens as output) - - Decode-heavy dataset (in average 462 input tokens, 256 output tokens) - - Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of datasets we use. - - Models: llama-3 8B, llama-3 70B. - - We do not use llama 3.1 as it is incompatible with trt-llm r24.07. ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)). - - Average QPS (query per second): 2, 4, 8, 16, 32 and inf. - - Queries are randomly sampled, and arrival patterns are determined via Poisson process, but all with fixed random seed. - - Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better). + - Dataset + - ShareGPT dataset + - Prefill-heavy dataset (on average 462 input tokens, 16 output tokens) + - Decode-heavy dataset (on average 462 input tokens, 256 output tokens) + - Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of datasets we use. + - Models: llama-3 8B, llama-3 70B. + - We do not use llama 3.1 as it is incompatible with trt-llm r24.07. ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)). + - Average QPS (query per second): 2, 4, 8, 16, 32 and inf. + - Queries are randomly sampled, and arrival patterns are determined via a Poisson process, but all with a fixed random seed. + - Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).
## Known issues diff --git a/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md b/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md index a1f8441ccdac8..8bb16bd3cf373 100644 --- a/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md +++ b/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md @@ -1,3 +1,4 @@ +# Performance benchmarks descriptions ## Latency tests diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py index 05623879c0c2c..554256b4bdb8b 100644 --- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py +++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py @@ -44,6 +44,7 @@ serving_column_mapping = { "test_name": "Test name", "gpu_type": "GPU", "completed": "# of req.", + "max_concurrency": "# of max concurrency.", "request_throughput": "Tput (req/s)", "total_token_throughput": "Total Token Tput (tok/s)", "output_throughput": "Output Tput (tok/s)", diff --git a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh index b515ee43934d1..2c57666a81aa3 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh @@ -33,7 +33,7 @@ check_gpus() { check_cpus() { # check the number of CPUs and NUMA Node and GPU type. - declare -g numa_count=$(python3 -c "from numa import info;numa_size = info.get_num_configured_nodes(); print(numa_size)") + declare -g numa_count=$(lscpu | grep "NUMA node(s):" | awk '{print $3}') if [[ $numa_count -gt 0 ]]; then echo "NUMA found." 
echo $numa_count diff --git a/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json b/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json index edbe9f2df0ce0..f26ae7634f3d9 100644 --- a/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json +++ b/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json @@ -11,7 +11,6 @@ }, "vllm_server_parameters": { "disable_log_stats": "", - "disable_log_requests": "", "gpu_memory_utilization": 0.9, "num_scheduler_steps": 10, "max_num_seqs": 512, diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json index fda1a7a3ec53c..41b4a4008801d 100644 --- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json +++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json @@ -35,7 +35,6 @@ }, "vllm_server_parameters": { "disable_log_stats": "", - "disable_log_requests": "", "gpu_memory_utilization": 0.9, "num_scheduler_steps": 10, "max_num_seqs": 512, @@ -90,7 +89,6 @@ }, "vllm_server_parameters": { "disable_log_stats": "", - "disable_log_requests": "", "gpu_memory_utilization": 0.9, "num_scheduler_steps": 10, "max_num_seqs": 512, @@ -145,7 +143,6 @@ }, "vllm_server_parameters": { "disable_log_stats": "", - "disable_log_requests": "", "gpu_memory_utilization": 0.9, "num_scheduler_steps": 10, "max_num_seqs": 512, @@ -197,7 +194,6 @@ }, "vllm_server_parameters": { "disable_log_stats": "", - "disable_log_requests": "", "gpu_memory_utilization": 0.9, "num_scheduler_steps": 10, "max_num_seqs": 512, @@ -251,7 +247,6 @@ }, "vllm_server_parameters": { "disable_log_stats": "", - "disable_log_requests": "", "gpu_memory_utilization": 0.9, "num_scheduler_steps": 10, "max_num_seqs": 512, @@ -305,7 +300,6 @@ }, "vllm_server_parameters": { "disable_log_stats": "", - "disable_log_requests": "", "gpu_memory_utilization": 0.9, "num_scheduler_steps": 10, "max_num_seqs": 512, diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json 
b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json new file mode 100644 index 0000000000000..dd0e24edff98d --- /dev/null +++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json @@ -0,0 +1,203 @@ +[ + { + "test_name": "serving_llama8B_tp1_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "max_concurrency": 60, + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_tp2_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 2, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "max_concurrency": 60, + 
"num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_tp4_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 4, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "max_concurrency": 60, + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_tp1_random_128_128", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "max_concurrency": 1000, + "num_prompts": 1000 + } + }, + { + "test_name": "serving_llama8B_tp2_random_128_128", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + 
"VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 2, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "max_concurrency": 1000, + "num_prompts": 1000 + } + }, + { + "test_name": "serving_llama8B_tp4_random_128_128", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 4, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "max_concurrency": 1000, + "num_prompts": 1000 + } + } +] diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json new file mode 100644 index 0000000000000..f1bda65a7590b --- /dev/null +++ 
b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json @@ -0,0 +1,205 @@ +[ + { + "test_name": "serving_llama8B_pp1_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "pipeline_parallel_size": 1, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "max_concurrency": 60, + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_pp3_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "pipeline_parallel_size": 3, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "max_concurrency": 60, + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_tp2pp3_sharegpt", + "qps_list": [1, 4, 16, "inf"], +
"server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 2, + "pipeline_parallel_size": 3, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "max_concurrency": 60, + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_pp1_random_128_128", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "pipeline_parallel_size": 1, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "max_concurrency": 1000, + "num_prompts": 1000 + } + }, + { + "test_name": "serving_llama8B_pp3_random_128_128", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + 
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "pipeline_parallel_size": 3, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "max_concurrency": 1000, + "num_prompts": 1000 + } + }, + { + "test_name": "serving_llama8B_tp2pp3_random_128_128", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 2, + "pipeline_parallel_size": 3, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "max_concurrency": 1000, + "num_prompts": 1000 + } + } +] diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json index 22f71c993ff33..f150b9abeea45 100644 --- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json +++
b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json @@ -6,6 +6,7 @@ "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { @@ -16,8 +17,9 @@ "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", - "disable_log_requests": "", "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, "load_format": "dummy" }, "client_parameters": { @@ -36,6 +38,7 @@ "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { @@ -46,8 +49,9 @@ "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", - "disable_log_requests": "", "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, "load_format": "dummy" }, "client_parameters": { @@ -66,6 +70,7 @@ "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { @@ -76,8 +81,9 @@ "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", - "disable_log_requests": "", "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, "load_format": "dummy" }, "client_parameters": { @@ -96,6 +102,7 @@ "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { @@ -107,8 +114,9 @@ "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", - "disable_log_requests": "", "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, "load_format": "dummy" }, "client_parameters": { @@ -129,6 +137,7 @@ "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + 
"VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { @@ -140,8 +149,9 @@ "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", - "disable_log_requests": "", "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, "load_format": "dummy" }, "client_parameters": { diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests.json b/.buildkite/nightly-benchmarks/tests/serving-tests.json index 13fd5aa8db97b..a6d4141d5c2dc 100644 --- a/.buildkite/nightly-benchmarks/tests/serving-tests.json +++ b/.buildkite/nightly-benchmarks/tests/serving-tests.json @@ -7,7 +7,6 @@ "tensor_parallel_size": 1, "swap_space": 16, "disable_log_stats": "", - "disable_log_requests": "", "load_format": "dummy" }, "client_parameters": { @@ -26,7 +25,6 @@ "tensor_parallel_size": 4, "swap_space": 16, "disable_log_stats": "", - "disable_log_requests": "", "load_format": "dummy" }, "client_parameters": { @@ -45,7 +43,6 @@ "tensor_parallel_size": 2, "swap_space": 16, "disable_log_stats": "", - "disable_log_requests": "", "load_format": "dummy" }, "client_parameters": { @@ -60,8 +57,7 @@ "test_name": "serving_llama70B_tp4_sharegpt_specdecode", "qps_list": [2], "server_parameters": { - "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", - "disable_log_requests": "", + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", "tensor_parallel_size": 4, "swap_space": 16, "speculative_config": { diff --git a/.buildkite/scripts/hardware_ci/run-gh200-test.sh b/.buildkite/scripts/hardware_ci/run-gh200-test.sh index 8c64e14606d3b..f69e4b06680f5 100644 --- a/.buildkite/scripts/hardware_ci/run-gh200-test.sh +++ b/.buildkite/scripts/hardware_ci/run-gh200-test.sh @@ -16,8 +16,7 @@ DOCKER_BUILDKIT=1 docker build . 
\ --build-arg max_jobs=66 \ --build-arg nvcc_threads=2 \ --build-arg RUN_WHEEL_CHECK=false \ - --build-arg torch_cuda_arch_list="9.0+PTX" \ - --build-arg vllm_fa_cmake_gpu_arches="90-real" + --build-arg torch_cuda_arch_list="9.0+PTX" # Setup cleanup remove_docker_container() { docker rm -f gh200-test || true; } diff --git a/.buildkite/scripts/tpu/run_bm.sh b/.buildkite/scripts/tpu/run_bm.sh index beecaf7a740ae..b1e17b438578d 100755 --- a/.buildkite/scripts/tpu/run_bm.sh +++ b/.buildkite/scripts/tpu/run_bm.sh @@ -44,7 +44,6 @@ echo VLLM_USE_V1=1 vllm serve $MODEL \ --seed 42 \ - --disable-log-requests \ --max-num-seqs $MAX_NUM_SEQS \ --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \ --tensor-parallel-size $TENSOR_PARALLEL_SIZE \ diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 948ce9e8667f5..e139c6b30586e 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -82,7 +82,7 @@ steps: - bash standalone_tests/python_only_compile.sh - label: Basic Correctness Test # 30min - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental] fast_check: true torch_nightly: true source_file_dependencies: @@ -99,7 +99,7 @@ steps: - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py - label: Chunked Prefill Test - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental] source_file_dependencies: - vllm/ - tests/basic_correctness/test_chunked_prefill @@ -108,7 +108,7 @@ steps: - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py - label: Core Test # 10min - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental] fast_check: true source_file_dependencies: - vllm/core @@ -128,11 +128,10 @@ steps: - tests/entrypoints/offline_mode commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py 
--ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py --ignore=entrypoints/llm/test_collective_rpc.py + - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_collective_rpc.py - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process - - VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests - label: Entrypoints Test (API Server) # 40min @@ -210,7 +209,7 @@ steps: - pytest -v -s distributed/test_eplb_execute.py - label: Metrics, Tracing Test # 10min - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental] num_gpus: 2 source_file_dependencies: - vllm/ @@ -229,7 +228,7 @@ steps: ##### 1 GPU test ##### - label: Regression Test # 5min - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental] source_file_dependencies: - vllm/ - tests/test_regression @@ -281,7 +280,7 @@ steps: - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine - label: Examples Test # 25min - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental] working_dir: "/vllm-workspace/examples" source_file_dependencies: - vllm/entrypoints @@ -306,7 +305,7 @@ steps: - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2 - label: Prefix Caching Test # 9min - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental] 
source_file_dependencies: - vllm/ - tests/prefix_caching @@ -315,7 +314,7 @@ steps: - label: Platform Tests (CUDA) - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental] source_file_dependencies: - vllm/ - tests/cuda @@ -354,9 +353,10 @@ steps: - pytest -v -s compile/test_silu_mul_quant_fusion.py - pytest -v -s compile/test_sequence_parallelism.py - pytest -v -s compile/test_async_tp.py + - pytest -v -s compile/test_fusion_all_reduce.py - label: PyTorch Fullgraph Smoke Test # 9min - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental] torch_nightly: true source_file_dependencies: - vllm/ @@ -369,7 +369,7 @@ steps: - pytest -v -s compile/piecewise/test_full_cudagraph.py - label: PyTorch Fullgraph Test # 18min - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental] torch_nightly: true source_file_dependencies: - vllm/ @@ -378,7 +378,7 @@ steps: - pytest -v -s compile/test_full_graph.py - label: Kernels Core Operation Test - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental] source_file_dependencies: - csrc/ - tests/kernels/core @@ -403,20 +403,21 @@ steps: - vllm/model_executor/layers/quantization - tests/kernels/quantization commands: - - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT parallelism: 2 -- label: Kernels MoE Test +- label: Kernels MoE Test %N mirror_hardwares: [amdexperimental] source_file_dependencies: - csrc/moe/ - tests/kernels/moe - vllm/model_executor/layers/fused_moe/ commands: - - pytest -v -s kernels/moe + - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + parallelism: 2 - label: Kernels Mamba Test - mirror_hardwares: [amdexperimental, amdproduction] + 
mirror_hardwares: [amdexperimental] source_file_dependencies: - csrc/mamba/ - tests/kernels/mamba @@ -424,7 +425,7 @@ steps: - pytest -v -s kernels/mamba - label: Tensorizer Test # 11min - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental] soft_fail: true source_file_dependencies: - vllm/model_executor/model_loader @@ -437,7 +438,7 @@ steps: - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py - label: Model Executor Test - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental] source_file_dependencies: - vllm/model_executor - tests/model_executor @@ -447,7 +448,7 @@ steps: - pytest -v -s model_executor - label: Benchmarks # 9min - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental] working_dir: "/vllm-workspace/.buildkite" source_file_dependencies: - benchmarks/ @@ -455,7 +456,7 @@ steps: - bash scripts/run-benchmarks.sh - label: Benchmarks CLI Test # 10min - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental] source_file_dependencies: - vllm/ - tests/benchmarks/ @@ -494,7 +495,7 @@ steps: - pytest -s entrypoints/openai/correctness/ - label: Encoder Decoder tests # 5min - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental] source_file_dependencies: - vllm/ - tests/encoder_decoder @@ -502,7 +503,7 @@ steps: - pytest -v -s encoder_decoder - label: OpenAI-Compatible Tool Use # 20 min - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental] fast_check: false source_file_dependencies: - vllm/ @@ -580,7 +581,8 @@ steps: - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pip freeze | grep -E 'torch' - pytest -v -s models/multimodal/processing - - pytest -v -s --ignore models/multimodal/generation/test_whisper.py models/multimodal -m core_model + - pytest -v -s --ignore models/multimodal/generation/test_whisper.py --ignore 
models/multimodal/test_tensor_schema.py models/multimodal -m core_model + - pytest -v -s models/multimodal/test_tensor_schema.py -m core_model # Needs mp_method="spawn" - cd .. && pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work - label: Multi-Modal Models Test (Extended) 1 @@ -623,7 +625,7 @@ steps: # This test is used only in PR development phase to test individual models and should never run on main - label: Custom Models Test - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental] optional: true commands: - echo 'Testing custom models...' @@ -643,11 +645,40 @@ steps: - python3 examples/offline_inference/audio_language.py --model-type whisper - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl +- label: Blackwell Test + working_dir: "/vllm-workspace/" + gpu: b200 + # optional: true + source_file_dependencies: + - csrc/quantization/fp4/ + - csrc/attention/mla/ + - csrc/quantization/cutlass_w8a8/moe/ + - vllm/model_executor/layers/fused_moe/cutlass_moe.py + - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py + - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py + - vllm/v1/attention/backends/flashinfer.py + - vllm/compilation/fusion.py + commands: + - nvidia-smi + - python3 examples/offline_inference/basic/chat.py + # Attention + # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353 + - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2' + - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py + - pytest -v -s tests/kernels/test_cutlass_mla_decode.py + # Quantization + - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8' + - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py + - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py + - pytest -v -s 
tests/kernels/moe/test_nvfp4_moe.py + # Fusion + - pytest -v -s tests/compile/test_fusion_all_reduce.py + ##### 1 GPU test ##### ##### multi gpus test ##### - label: Distributed Comm Ops Test # 7min - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental] working_dir: "/vllm-workspace/tests" num_gpus: 2 source_file_dependencies: @@ -718,7 +749,6 @@ steps: # this test fails consistently. # TODO: investigate and fix - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py - - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown - pytest -v -s models/multimodal/generation/test_maverick.py @@ -744,7 +774,7 @@ steps: - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins - label: Multi-step Tests (4 GPUs) # 36min - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental] working_dir: "/vllm-workspace/tests" num_gpus: 4 source_file_dependencies: @@ -765,7 +795,7 @@ steps: - pytest -v -s multi_step/test_correctness_llm.py - label: Pipeline Parallelism Test # 45min - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental] working_dir: "/vllm-workspace/tests" num_gpus: 4 source_file_dependencies: @@ -779,7 +809,7 @@ steps: - pytest -v -s distributed/test_pipeline_parallel.py - label: LoRA TP Test (Distributed) - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental] num_gpus: 4 source_file_dependencies: - vllm/lora @@ -792,6 +822,7 @@ steps: # requires multi-GPU testing for validation. 
- pytest -v -s -x lora/test_chatglm3_tp.py - pytest -v -s -x lora/test_llama_tp.py + - pytest -v -s -x lora/test_multi_loras_with_tp.py - label: Weight Loading Multiple GPU Test # 33min diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 2441055371663..5bc944296763d 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -10,7 +10,6 @@ /vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill /vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth -/vllm/model_executor/guided_decoding @mgoin @russellb @aarnphm /vllm/multimodal @DarkLight1337 @ywang96 /vllm/vllm_flash_attn @LucasWilkinson /vllm/lora @jeejeelee @@ -35,9 +34,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson /tests/distributed/test_pipeline_parallel.py @youkaichao /tests/distributed/test_same_node.py @youkaichao /tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm -/tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb @aarnphm /tests/kernels @tlrmchlsmth @WoosukKwon -/tests/model_executor/test_guided_processors.py @mgoin @russellb /tests/models @DarkLight1337 @ywang96 /tests/multi_step @alexm-redhat @comaniac /tests/multimodal @DarkLight1337 @ywang96 @@ -64,3 +61,15 @@ mkdocs.yaml @hmellor /vllm/v1/worker/^xpu @jikunshang /vllm/platforms/xpu.py @jikunshang /docker/Dockerfile.xpu @jikunshang + +# Qwen-specific files +/vllm/attention/backends/dual_chunk_flash_attn.py @sighingnow +/vllm/model_executor/models/qwen* @sighingnow + +# Mistral-specific files +/vllm/model_executor/models/mistral*.py @patrickvonplaten +/vllm/model_executor/models/mixtral*.py @patrickvonplaten +/vllm/model_executor/models/voxtral*.py @patrickvonplaten +/vllm/model_executor/models/pixtral*.py @patrickvonplaten +/vllm/transformers_utils/configs/mistral.py @patrickvonplaten +/vllm/transformers_utils/tokenizers/mistral.py @patrickvonplaten diff --git 
a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 017ec7ca82da7..d4aceab4472fa 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,4 +1,5 @@ -## Essential Elements of an Effective PR Description Checklist +# Essential Elements of an Effective PR Description Checklist + - [ ] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)". - [ ] The test plan, such as providing test command. - [ ] The test results, such as pasting the results comparison before and after, or e2e results @@ -14,5 +15,4 @@ PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS ABOVE HAVE B ## (Optional) Documentation Update - **BEFORE SUBMITTING, PLEASE READ ** (anything written below this line will be removed by GitHub Actions) diff --git a/.github/mergify.yml b/.github/mergify.yml index 5c878ac02069f..d8ae509e0ac30 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -149,9 +149,6 @@ pull_request_rules: - files=examples/offline_inference/structured_outputs.py - files=examples/online_serving/openai_chat_completion_structured_outputs.py - files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py - - files~=^vllm/model_executor/guided_decoding/ - - files=tests/model_executor/test_guided_processors.py - - files=tests/entrypoints/llm/test_guided_generate.py - files~=^tests/v1/structured_output/ - files=tests/v1/entrypoints/llm/test_guided_generate.py - files~=^vllm/v1/structured_output/ diff --git a/.github/workflows/lint-and-deploy.yaml b/.github/workflows/lint-and-deploy.yaml index d5736c0aee208..2b1086b7faf43 100644 --- a/.github/workflows/lint-and-deploy.yaml +++ b/.github/workflows/lint-and-deploy.yaml @@ -2,12 +2,16 @@ name: Lint and Deploy Charts on: pull_request +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + permissions: contents: read jobs: lint-and-deploy: - runs-on: ubuntu-24.04-arm + 
runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 diff --git a/.github/workflows/matchers/markdownlint.json b/.github/workflows/matchers/markdownlint.json new file mode 100644 index 0000000000000..fe094a9badb25 --- /dev/null +++ b/.github/workflows/matchers/markdownlint.json @@ -0,0 +1,17 @@ +{ + "problemMatcher": [ + { + "owner": "markdownlint", + "pattern": [ + { + "regexp": "^([^:]*):(\\d+):?(\\d+)?\\s([\\w-\\/]*)\\s(.*)$", + "file": 1, + "line": 2, + "column": 3, + "code": 4, + "message": 5 + } + ] + } + ] +} \ No newline at end of file diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 8e694d18134ef..195579f206a2f 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -5,6 +5,10 @@ on: push: branches: [main] +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: ${{ github.event_name == 'pull_request' }} + permissions: contents: read @@ -17,6 +21,7 @@ jobs: with: python-version: "3.12" - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json" + - run: echo "::add-matcher::.github/workflows/matchers/markdownlint.json" - run: echo "::add-matcher::.github/workflows/matchers/mypy.json" - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1 with: diff --git a/.github/workflows/scripts/build.sh b/.github/workflows/scripts/build.sh index 0f010832b465d..c69ebbb42da5a 100644 --- a/.github/workflows/scripts/build.sh +++ b/.github/workflows/scripts/build.sh @@ -15,7 +15,6 @@ $python_executable -m pip install -r requirements/build.txt -r requirements/cuda export MAX_JOBS=1 # Make sure release wheels are built for the following architectures export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX" -export VLLM_FA_CMAKE_GPU_ARCHES="80-real;90-real" bash tools/check_repo.sh diff --git a/.markdownlint.yaml b/.markdownlint.yaml new file mode 100644 index 
0000000000000..c86fed9555d62 --- /dev/null +++ b/.markdownlint.yaml @@ -0,0 +1,13 @@ +MD007: + indent: 4 +MD013: false +MD024: + siblings_only: true +MD033: false +MD042: false +MD045: false +MD046: false +MD051: false +MD052: false +MD053: false +MD059: false diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5197820fb4020..612b290e88d46 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -35,12 +35,12 @@ repos: exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*' types_or: [c++, cuda] args: [--style=file, --verbose] -- repo: https://github.com/jackdewinter/pymarkdown - rev: v0.9.29 +- repo: https://github.com/igorshubovych/markdownlint-cli + rev: v0.45.0 hooks: - - id: pymarkdown + - id: markdownlint exclude: '.*\.inc\.md' - args: [fix] + stages: [manual] # Only run in CI - repo: https://github.com/rhysd/actionlint rev: v1.7.7 hooks: diff --git a/CMakeLists.txt b/CMakeLists.txt index 664fb6a0ee9f0..e2cc0ccdef515 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -243,6 +243,7 @@ set(VLLM_EXT_SRC "csrc/sampler.cu" "csrc/cuda_view.cu" "csrc/quantization/gptq/q_gemm.cu" + "csrc/quantization/compressed_tensors/int8_quant_kernels.cu" "csrc/quantization/fp8/common.cu" "csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu" "csrc/quantization/gguf/gguf_kernel.cu" @@ -296,8 +297,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu" "csrc/cutlass_extensions/common.cpp" "csrc/attention/mla/cutlass_mla_entry.cu" - "csrc/quantization/fp8/per_token_group_quant.cu" - "csrc/quantization/compressed_tensors/int8_quant_kernels.cu") + "csrc/quantization/fp8/per_token_group_quant.cu") set_gencode_flags_for_srcs( SRCS "${VLLM_EXT_SRC}" @@ -529,6 +529,25 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") endif() endif() + # The nvfp4_scaled_mm_sm120 kernels for Geforce Blackwell SM120 require + # CUDA 12.8 or later + 
cuda_archs_loose_intersection(FP4_ARCHS "12.0;12.0a" "${CUDA_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS) + set(SRCS + "csrc/quantization/fp4/nvfp4_quant_kernels.cu" + "csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu") + set_gencode_flags_for_srcs( + SRCS "${SRCS}" + CUDA_ARCHS "${FP4_ARCHS}") + list(APPEND VLLM_EXT_SRC "${SRCS}") + list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4_SM120=1") + message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}") + else() + message(STATUS "Not building NVFP4 as no compatible archs were found.") + # clear FP4_ARCHS + set(FP4_ARCHS) + endif() + # FP4 Archs and flags cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS}") if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS) @@ -541,7 +560,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") SRCS "${SRCS}" CUDA_ARCHS "${FP4_ARCHS}") list(APPEND VLLM_EXT_SRC "${SRCS}") - list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4=1") + list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4_SM100=1") list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM100=1") message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}") else() diff --git a/README.md b/README.md index dc2f0afbe3538..5348405b72d2c 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,4 @@ +

@@ -16,6 +17,7 @@ Easy, fast, and cheap LLM serving for everyone --- *Latest News* 🔥 + - [2025/05] We hosted [NYC vLLM Meetup](https://lu.ma/c1rqyf1f)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing). - [2025/05] vLLM is now a hosted project under PyTorch Foundation! Please find the announcement [here](https://pytorch.org/blog/pytorch-foundation-welcomes-vllm/). - [2025/04] We hosted [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing). @@ -46,6 +48,7 @@ Easy, fast, and cheap LLM serving for everyone --- + ## About vLLM is a fast and easy-to-use library for LLM inference and serving. @@ -75,6 +78,7 @@ vLLM is flexible and easy to use with: - Multi-LoRA support vLLM seamlessly supports most popular open-source models on HuggingFace, including: + - Transformer-like LLMs (e.g., Llama) - Mixture-of-Expert LLMs (e.g., Mixtral, Deepseek-V2 and V3) - Embedding Models (e.g., E5-Mistral) @@ -91,6 +95,7 @@ pip install vllm ``` Visit our [documentation](https://docs.vllm.ai/en/latest/) to learn more. + - [Installation](https://docs.vllm.ai/en/latest/getting_started/installation.html) - [Quickstart](https://docs.vllm.ai/en/latest/getting_started/quickstart.html) - [List of Supported Models](https://docs.vllm.ai/en/latest/models/supported_models.html) @@ -107,6 +112,7 @@ vLLM is a community project. 
Our compute resources for development and testing a Cash Donations: + - a16z - Dropbox - Sequoia Capital @@ -114,6 +120,7 @@ Cash Donations: - ZhenFund Compute Resources: + - AMD - Anyscale - AWS diff --git a/RELEASE.md b/RELEASE.md index 9352e7ef706c6..db0d51afc7be1 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -60,9 +60,10 @@ Please note: **No feature work allowed for cherry picks**. All PRs that are cons Before each release, we perform end-to-end performance validation to ensure no regressions are introduced. This validation uses the [vllm-benchmark workflow](https://github.com/pytorch/pytorch-integration-testing/actions/workflows/vllm-benchmark.yml) on PyTorch CI. **Current Coverage:** + * Models: Llama3, Llama4, and Mixtral * Hardware: NVIDIA H100 and AMD MI300x -* *Note: Coverage may change based on new model releases and hardware availability* +* _Note: Coverage may change based on new model releases and hardware availability_ **Performance Validation Process:** @@ -71,11 +72,13 @@ Request write access to the [pytorch/pytorch-integration-testing](https://github **Step 2: Review Benchmark Setup** Familiarize yourself with the benchmark configurations: + * [CUDA setup](https://github.com/pytorch/pytorch-integration-testing/tree/main/vllm-benchmarks/benchmarks/cuda) * [ROCm setup](https://github.com/pytorch/pytorch-integration-testing/tree/main/vllm-benchmarks/benchmarks/rocm) **Step 3: Run the Benchmark** Navigate to the [vllm-benchmark workflow](https://github.com/pytorch/pytorch-integration-testing/actions/workflows/vllm-benchmark.yml) and configure: + * **vLLM branch**: Set to the release branch (e.g., `releases/v0.9.2`) * **vLLM commit**: Set to the RC commit hash diff --git a/SECURITY.md b/SECURITY.md index 6053cfb41f35b..414669fb3712e 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -1,13 +1,45 @@ # Security Policy -## Reporting a Vulnerability +## Reporting security issues -If you believe you have found a security vulnerability in vLLM, we encourage you to 
let us know right away. We will investigate all legitimate reports and do our best to quickly fix the problem. +Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new). -Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new). Reports will then be triaged by the [vulnerability management team](https://docs.vllm.ai/en/latest/contributing/vulnerability_management.html). +## Issue triage ---- +Reports will then be triaged by the [vulnerability management team](https://docs.vllm.ai/en/latest/contributing/vulnerability_management.html). + +## Threat model Please see the [Security Guide in the vLLM documentation](https://docs.vllm.ai/en/latest/usage/security.html) for more information on vLLM's security assumptions and recommendations. Please see [PyTorch's Security Policy](https://github.com/pytorch/pytorch/blob/main/SECURITY.md) for more information and recommendations on how to securely interact with models. + +## Issue severity + +We will determine the risk of each issue, taking into account our experience dealing with past issues, versions affected, common defaults, and use cases. We use the following severity categories: + +### CRITICAL Severity + +Vulnerabilities that allow remote attackers to execute arbitrary code, take full control of the system, or significantly compromise confidentiality, integrity, or availability without any interaction or privileges needed. Examples include remote code execution over the network and deserialization issues that allow exploit chains. These are generally issues rated CVSS ≥ 9.0. + +### HIGH Severity + +Serious security flaws that allow elevated impact, such as RCE in specific, limited contexts or significant data loss, but require advanced conditions or some trust. Examples include RCE in advanced deployment modes (e.g.
multi-node), or high-impact issues where some sort of privileged network access is required. These issues typically have CVSS scores between 7.0 and 8.9. + +### MODERATE Severity + +Vulnerabilities that cause denial of service or partial disruption, but do not allow arbitrary code execution or data breach and have limited impact. These issues have a CVSS rating between 4.0 and 6.9. + +### LOW Severity + +Minor issues such as informational disclosures, logging errors, non-exploitable flaws, or weaknesses that require local or high-privilege access and offer negligible impact. Examples include side-channel attacks or hash collisions. These issues often have CVSS scores less than 4.0. + +## Prenotification policy + +For certain security issues of CRITICAL, HIGH, or MODERATE severity level, we may prenotify certain organizations or vendors that ship vLLM. The purpose of this prenotification is to allow for a coordinated release of fixes for severe issues. + +* This prenotification will be in the form of a private email notification. It may also include adding security contacts to the GitHub security advisory, typically a few days before release. + +* If you wish to be added to the prenotification group, please send an email copying all the members of the [vulnerability management team](https://docs.vllm.ai/en/latest/contributing/vulnerability_management.html). Each vendor contact will be analyzed on a case-by-case basis. + +* We may withdraw organizations from receiving future prenotifications if they release fixes or any other information about issues before they are public. Group membership may also change based on policy refinements for who may be included. diff --git a/benchmarks/README.md b/benchmarks/README.md index 3b10963c3e014..d6442a4fc3872 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -4,7 +4,7 @@ This README guides you through running benchmark tests with the extensive datasets supported on vLLM.
It’s a living document, updated as new features and datasets become available. -**Dataset Overview** +## Dataset Overview @@ -81,16 +81,17 @@ become available. **Note**: HuggingFace dataset's `dataset-name` should be set to `hf` ---- +## 🚀 Example - Online Benchmark +
 <details>
-<summary>🚀 Example - Online Benchmark</summary>
+<summary>Show more</summary>
First start serving your model ```bash -vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests +vllm serve NousResearch/Hermes-3-Llama-3.1-8B ``` Then run the benchmarking script @@ -109,7 +110,7 @@ vllm bench serve \ If successful, you will see the following output -``` +```text ============ Serving Benchmark Result ============ Successful requests: 10 Benchmark duration (s): 5.78 @@ -133,11 +134,11 @@ P99 ITL (ms): 8.39 ================================================== ``` -**Custom Dataset** +### Custom Dataset If the dataset you want to benchmark is not supported yet in vLLM, even then you can benchmark on it using `CustomDataset`. Your data needs to be in `.jsonl` format and needs to have "prompt" field per entry, e.g., data.jsonl -``` +```json {"prompt": "What is the capital of India?"} {"prompt": "What is the capital of Iran?"} {"prompt": "What is the capital of China?"} @@ -145,7 +146,7 @@ If the dataset you want to benchmark is not supported yet in vLLM, even then you ```bash # start server -VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct --disable-log-requests +VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct ``` ```bash @@ -166,11 +167,11 @@ vllm bench serve --port 9001 --save-result --save-detailed \ You can skip applying chat template if your data already has it by using `--custom-skip-chat-template`. 
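As a quick companion to the `CustomDataset` instructions in the hunk above, here is a minimal sketch that generates a conforming `.jsonl` file with one JSON object per line, each carrying the required "prompt" field; the output path and prompts are made up for illustration:

```python
import json
import tempfile
from pathlib import Path

# Example prompts; any strings work, one per request.
prompts = [
    "What is the capital of India?",
    "What is the capital of Iran?",
    "What is the capital of China?",
]

# Write one JSON object per line, each with a "prompt" field,
# which is the shape CustomDataset expects.
path = Path(tempfile.gettempdir()) / "data.jsonl"
with path.open("w") as f:
    for p in prompts:
        f.write(json.dumps({"prompt": p}) + "\n")

# Sanity-check: every line parses and carries a "prompt" key.
rows = [json.loads(line) for line in path.read_text().splitlines()]
```

The resulting file can then be passed to the benchmark via `--dataset-path` as shown in the commands above.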
-**VisionArena Benchmark for Vision Language Models** +### VisionArena Benchmark for Vision Language Models ```bash # need a model with vision capability here -vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests +vllm serve Qwen/Qwen2-VL-7B-Instruct ``` ```bash @@ -184,7 +185,7 @@ vllm bench serve \ --num-prompts 1000 ``` -**InstructCoder Benchmark with Speculative Decoding** +### InstructCoder Benchmark with Speculative Decoding ``` bash VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \ @@ -201,13 +202,13 @@ vllm bench serve \ --num-prompts 2048 ``` -**Other HuggingFaceDataset Examples** +### Other HuggingFaceDataset Examples ```bash -vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests +vllm serve Qwen/Qwen2-VL-7B-Instruct ``` -**`lmms-lab/LLaVA-OneVision-Data`** +`lmms-lab/LLaVA-OneVision-Data`: ```bash vllm bench serve \ @@ -221,7 +222,7 @@ vllm bench serve \ --num-prompts 10 ``` -**`Aeala/ShareGPT_Vicuna_unfiltered`** +`Aeala/ShareGPT_Vicuna_unfiltered`: ```bash vllm bench serve \ @@ -234,7 +235,7 @@ vllm bench serve \ --num-prompts 10 ``` -**`AI-MO/aimo-validation-aime`** +`AI-MO/aimo-validation-aime`: ``` bash vllm bench serve \ @@ -245,7 +246,7 @@ vllm bench serve \ --seed 42 ``` -**`philschmid/mt-bench`** +`philschmid/mt-bench`: ``` bash vllm bench serve \ @@ -255,7 +256,7 @@ vllm bench serve \ --num-prompts 80 ``` -**Running With Sampling Parameters** +### Running With Sampling Parameters When using OpenAI-compatible backends such as `vllm`, optional sampling parameters can be specified. Example client command: @@ -273,25 +274,29 @@ vllm bench serve \ --num-prompts 10 ``` -**Running With Ramp-Up Request Rate** +### Running With Ramp-Up Request Rate The benchmark tool also supports ramping up the request rate over the duration of the benchmark run. This can be useful for stress testing the server or finding the maximum throughput that it can handle, given some latency budget. 
Two ramp-up strategies are supported: + - `linear`: Increases the request rate linearly from a start value to an end value. - `exponential`: Increases the request rate exponentially. The following arguments can be used to control the ramp-up: + - `--ramp-up-strategy`: The ramp-up strategy to use (`linear` or `exponential`). - `--ramp-up-start-rps`: The request rate at the beginning of the benchmark. - `--ramp-up-end-rps`: The request rate at the end of the benchmark.
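The two ramp-up strategies described above amount to interpolating the request rate between `--ramp-up-start-rps` and `--ramp-up-end-rps` over the run. The sketch below illustrates the idea only; it is not necessarily the exact schedule `vllm bench serve` implements:

```python
def ramp_rate(strategy: str, start_rps: float, end_rps: float, progress: float) -> float:
    """Request rate at a given point of the run, with progress in [0, 1].

    linear: constant additive increase from start_rps to end_rps.
    exponential: multiplicative (geometric) growth; requires start_rps > 0.
    """
    if strategy == "linear":
        return start_rps + (end_rps - start_rps) * progress
    if strategy == "exponential":
        return start_rps * (end_rps / start_rps) ** progress
    raise ValueError(f"unknown strategy: {strategy}")

print(ramp_rate("linear", 1, 9, 0.5))        # arithmetic midpoint: 5.0
print(ramp_rate("exponential", 1, 16, 0.5))  # geometric midpoint: 4.0
```

The exponential form keeps early load gentle and back-loads the pressure, which is why it is useful for probing the server's saturation point.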
 </details>
+## 📈 Example - Offline Throughput Benchmark
+
 <details>
-<summary>📈 Example - Offline Throughput Benchmark</summary>
+<summary>Show more</summary>
@@ -305,15 +310,15 @@ vllm bench throughput \ If successful, you will see the following output -``` +```text Throughput: 7.15 requests/s, 4656.00 total tokens/s, 1072.15 output tokens/s Total num prompt tokens: 5014 Total num output tokens: 1500 ``` -**VisionArena Benchmark for Vision Language Models** +### VisionArena Benchmark for Vision Language Models -``` bash +```bash vllm bench throughput \ --model Qwen/Qwen2-VL-7B-Instruct \ --backend vllm-chat \ @@ -325,13 +330,13 @@ vllm bench throughput \ The `num prompt tokens` now includes image token counts -``` +```text Throughput: 2.55 requests/s, 4036.92 total tokens/s, 326.90 output tokens/s Total num prompt tokens: 14527 Total num output tokens: 1280 ``` -**InstructCoder Benchmark with Speculative Decoding** +### InstructCoder Benchmark with Speculative Decoding ``` bash VLLM_WORKER_MULTIPROC_METHOD=spawn \ @@ -349,15 +354,15 @@ vllm bench throughput \ "prompt_lookup_min": 2}' ``` -``` +```text Throughput: 104.77 requests/s, 23836.22 total tokens/s, 10477.10 output tokens/s Total num prompt tokens: 261136 Total num output tokens: 204800 ``` -**Other HuggingFaceDataset Examples** +### Other HuggingFaceDataset Examples -**`lmms-lab/LLaVA-OneVision-Data`** +`lmms-lab/LLaVA-OneVision-Data`: ```bash vllm bench throughput \ @@ -370,7 +375,7 @@ vllm bench throughput \ --num-prompts 10 ``` -**`Aeala/ShareGPT_Vicuna_unfiltered`** +`Aeala/ShareGPT_Vicuna_unfiltered`: ```bash vllm bench throughput \ @@ -382,7 +387,7 @@ vllm bench throughput \ --num-prompts 10 ``` -**`AI-MO/aimo-validation-aime`** +`AI-MO/aimo-validation-aime`: ```bash vllm bench throughput \ @@ -394,7 +399,7 @@ vllm bench throughput \ --num-prompts 10 ``` -**Benchmark with LoRA Adapters** +Benchmark with LoRA adapters: ``` bash # download dataset @@ -413,20 +418,22 @@ vllm bench throughput \
 </details>
+## 🛠️ Example - Structured Output Benchmark
+
 <details>
-<summary>🛠️ Example - Structured Output Benchmark</summary>
+<summary>Show more</summary>
Benchmark the performance of structured output generation (JSON, grammar, regex). -**Server Setup** +### Server Setup ```bash -vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests +vllm serve NousResearch/Hermes-3-Llama-3.1-8B ``` -**JSON Schema Benchmark** +### JSON Schema Benchmark ```bash python3 benchmarks/benchmark_serving_structured_output.py \ @@ -438,7 +445,7 @@ python3 benchmarks/benchmark_serving_structured_output.py \ --num-prompts 1000 ``` -**Grammar-based Generation Benchmark** +### Grammar-based Generation Benchmark ```bash python3 benchmarks/benchmark_serving_structured_output.py \ @@ -450,7 +457,7 @@ python3 benchmarks/benchmark_serving_structured_output.py \ --num-prompts 1000 ``` -**Regex-based Generation Benchmark** +### Regex-based Generation Benchmark ```bash python3 benchmarks/benchmark_serving_structured_output.py \ @@ -461,7 +468,7 @@ python3 benchmarks/benchmark_serving_structured_output.py \ --num-prompts 1000 ``` -**Choice-based Generation Benchmark** +### Choice-based Generation Benchmark ```bash python3 benchmarks/benchmark_serving_structured_output.py \ @@ -472,7 +479,7 @@ python3 benchmarks/benchmark_serving_structured_output.py \ --num-prompts 1000 ``` -**XGrammar Benchmark Dataset** +### XGrammar Benchmark Dataset ```bash python3 benchmarks/benchmark_serving_structured_output.py \ @@ -485,14 +492,16 @@ python3 benchmarks/benchmark_serving_structured_output.py \
 </details>
+## 📚 Example - Long Document QA Benchmark
+
 <details>
-<summary>📚 Example - Long Document QA Benchmark</summary>
+<summary>Show more</summary>
Benchmark the performance of long document question-answering with prefix caching. -**Basic Long Document QA Test** +### Basic Long Document QA Test ```bash python3 benchmarks/benchmark_long_document_qa_throughput.py \ @@ -504,7 +513,7 @@ python3 benchmarks/benchmark_long_document_qa_throughput.py \ --repeat-count 5 ``` -**Different Repeat Modes** +### Different Repeat Modes ```bash # Random mode (default) - shuffle prompts randomly @@ -537,14 +546,16 @@ python3 benchmarks/benchmark_long_document_qa_throughput.py \
 </details>
+## 🗂️ Example - Prefix Caching Benchmark
+
 <details>
-<summary>🗂️ Example - Prefix Caching Benchmark</summary>
+<summary>Show more</summary>
Benchmark the efficiency of automatic prefix caching. -**Fixed Prompt with Prefix Caching** +### Fixed Prompt with Prefix Caching ```bash python3 benchmarks/benchmark_prefix_caching.py \ @@ -555,7 +566,7 @@ python3 benchmarks/benchmark_prefix_caching.py \ --input-length-range 128:256 ``` -**ShareGPT Dataset with Prefix Caching** +### ShareGPT Dataset with Prefix Caching ```bash # download dataset @@ -572,14 +583,16 @@ python3 benchmarks/benchmark_prefix_caching.py \
 </details>
+## ⚡ Example - Request Prioritization Benchmark
+
 <details>
-<summary>⚡ Example - Request Prioritization Benchmark</summary>
+<summary>Show more</summary>
Benchmark the performance of request prioritization in vLLM. -**Basic Prioritization Test** +### Basic Prioritization Test ```bash python3 benchmarks/benchmark_prioritization.py \ @@ -590,7 +603,7 @@ python3 benchmarks/benchmark_prioritization.py \ --scheduling-policy priority ``` -**Multiple Sequences per Prompt** +### Multiple Sequences per Prompt ```bash python3 benchmarks/benchmark_prioritization.py \ diff --git a/benchmarks/auto_tune/README.md b/benchmarks/auto_tune/README.md index c479ff1aa29c0..9aad51df6e003 100644 --- a/benchmarks/auto_tune/README.md +++ b/benchmarks/auto_tune/README.md @@ -3,6 +3,7 @@ This script automates the process of finding the optimal server parameter combination (`max-num-seqs` and `max-num-batched-tokens`) to maximize throughput for a vLLM server. It also supports additional constraints such as E2E latency and prefix cache hit rate. ## Table of Contents + - [Prerequisites](#prerequisites) - [Configuration](#configuration) - [How to Run](#how-to-run) @@ -52,7 +53,7 @@ You must set the following variables at the top of the script before execution. 1. **Configure**: Edit the script and set the variables in the [Configuration](#configuration) section. 2. **Execute**: Run the script. Since the process can take a long time, it is highly recommended to use a terminal multiplexer like `tmux` or `screen` to prevent the script from stopping if your connection is lost. -``` +```bash cd bash auto_tune.sh ``` @@ -64,6 +65,7 @@ bash auto_tune.sh Here are a few examples of how to configure the script for different goals: ### 1. Maximize Throughput (No Latency Constraint) + - **Goal**: Find the best `max-num-seqs` and `max-num-batched-tokens` to get the highest possible throughput for 1800 input tokens and 20 output tokens. - **Configuration**: @@ -76,6 +78,7 @@ MAX_LATENCY_ALLOWED_MS=100000000000 # A very large number ``` #### 2. 
Maximize Throughput with a Latency Requirement + - **Goal**: Find the best server parameters when P99 end-to-end latency must be below 500ms. - **Configuration**: @@ -88,6 +91,7 @@ MAX_LATENCY_ALLOWED_MS=500 ``` #### 3. Maximize Throughput with Prefix Caching and Latency Requirements + - **Goal**: Find the best server parameters assuming a 60% prefix cache hit rate and a latency requirement of 500ms. - **Configuration**: @@ -109,7 +113,7 @@ After the script finishes, you will find the results in a new, timestamped direc - **Final Result Summary**: A file named `result.txt` is created in the log directory. It contains a summary of each tested combination and concludes with the overall best parameters found. -``` +```text # Example result.txt content hash:a1b2c3d4... max_num_seqs: 128, max_num_batched_tokens: 2048, request_rate: 10.0, e2el: 450.5, throughput: 9.8, goodput: 9.8 diff --git a/benchmarks/auto_tune/auto_tune.sh b/benchmarks/auto_tune/auto_tune.sh index 3cd8580e065dd..82c20ffa6554c 100644 --- a/benchmarks/auto_tune/auto_tune.sh +++ b/benchmarks/auto_tune/auto_tune.sh @@ -49,6 +49,7 @@ best_throughput=0 best_max_num_seqs=0 best_num_batched_tokens=0 best_goodput=0 +best_request_rate=0 start_server() { local gpu_memory_utilization=$1 @@ -57,19 +58,35 @@ start_server() { local vllm_log=$4 local profile_dir=$5 - pkill -f vllm + pkill -if vllm - VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir vllm serve $MODEL \ - --disable-log-requests \ - --port 8004 \ - --gpu-memory-utilization $gpu_memory_utilization \ - --max-num-seqs $max_num_seqs \ - --max-num-batched-tokens $max_num_batched_tokens \ - --tensor-parallel-size $TP \ - --enable-prefix-caching \ - --load-format dummy \ - --download-dir "$DOWNLOAD_DIR" \ - --max-model-len $MAX_MODEL_LEN > "$vllm_log" 2>&1 & + # Define the common arguments as a bash array. + # Each argument and its value are separate elements. 
+ local common_args_array=( + "$MODEL" + "--disable-log-requests" + "--port" "8004" + "--gpu-memory-utilization" "$gpu_memory_utilization" + "--max-num-seqs" "$max_num_seqs" + "--max-num-batched-tokens" "$max_num_batched_tokens" + "--tensor-parallel-size" "$TP" + "--enable-prefix-caching" + "--load-format" "dummy" + "--download-dir" "$DOWNLOAD_DIR" + "--max-model-len" "$MAX_MODEL_LEN" + ) + + # Use the array expansion "${common_args_array[@]}" + # This correctly passes each element as a separate argument. + if [[ -n "$profile_dir" ]]; then + # Start server with profiling enabled + VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \ + vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 & + else + # Start server without profiling + VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 \ + vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 & + fi # wait for 10 minutes... server_started=0 @@ -83,6 +100,7 @@ start_server() { sleep 10 fi done + if (( ! server_started )); then echo "server did not start within 10 minutes. Please check server log at $vllm_log". 
return 1 @@ -91,37 +109,20 @@ start_server() { fi } -update_best_profile() { - local profile_dir=$1 - local profile_index=$2 - sorted_paths=($(find "$profile_dir" -maxdepth 1 -not -path "$profile_dir" | sort)) - selected_profile_file= - if [[ "$SYSTEM" == "TPU" ]]; then - selected_profile_file="${sorted_paths[$profile_index]}/*.xplane.pb" - fi - if [[ "$SYSTEM" == "GPU" ]]; then - selected_profile_file="${sorted_paths[$profile_index]}" - fi - rm -f $PROFILE_PATH/* - cp $selected_profile_file $PROFILE_PATH -} - run_benchmark() { local max_num_seqs=$1 local max_num_batched_tokens=$2 local gpu_memory_utilization=$3 echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens" local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt" - local profile_dir="$LOG_FOLDER/profile_${max_num_seqs}_${max_num_batched_tokens}" echo "vllm_log: $vllm_log" echo rm -f $vllm_log - mkdir -p $profile_dir - pkill -f vllm - local profile_index=0 + pkill -if vllm echo "starting server..." - start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log $profile_dir + # Call start_server without a profile_dir to avoid profiling overhead + start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log "" result=$? if [[ "$result" -eq 1 ]]; then echo "server failed to start. 
gpu_memory_utilization:$gpu_memory_utilization, max_num_seqs:$max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens" @@ -135,7 +136,8 @@ run_benchmark() { # get a basic qps by using request-rate inf bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt" prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 )) -adjusted_input_len=$(( INPUT_LEN - prefix_len )) + adjusted_input_len=$(( INPUT_LEN - prefix_len )) + # --profile flag is removed from this call vllm bench serve \ --backend vllm \ --model $MODEL \ @@ -149,8 +151,7 @@ adjusted_input_len=$(( INPUT_LEN - prefix_len )) --goodput e2el:$MAX_LATENCY_ALLOWED_MS \ --num-prompts 1000 \ --random-prefix-len $prefix_len \ - --port 8004 \ - --profile &> "$bm_log" + --port 8004 &> "$bm_log" throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g') e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}') goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g') @@ -164,7 +165,6 @@ adjusted_input_len=$(( INPUT_LEN - prefix_len )) # start from request-rate as int(throughput) + 1 request_rate=$((${throughput%.*} + 1)) while ((request_rate > 0)); do - profile_index=$((profile_index+1)) # clear prefix cache curl -X POST http://0.0.0.0:8004/reset_prefix_cache sleep 5 @@ -202,12 +202,7 @@ adjusted_input_len=$(( INPUT_LEN - prefix_len )) best_max_num_seqs=$max_num_seqs best_num_batched_tokens=$max_num_batched_tokens best_goodput=$goodput - if [[ "$SYSTEM" == "TPU" ]]; then - update_best_profile "$profile_dir/plugins/profile" $profile_index - fi - if [[ "$SYSTEM" == "GPU" ]]; then - update_best_profile "$profile_dir" $profile_index - fi + best_request_rate=$request_rate fi else echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}" @@ -216,7 +211,7 @@ adjusted_input_len=$(( INPUT_LEN - prefix_len )) echo "best_max_num_seqs: $best_max_num_seqs, 
best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput" - pkill vllm + pkill -if vllm sleep 10 printf '=%.0s' $(seq 1 20) return 0 @@ -229,7 +224,8 @@ read -r -a num_batched_tokens_list <<< "$NUM_BATCHED_TOKENS_LIST" gpu_memory_utilization=0.98 find_gpu_memory_utilization=0 while (( $(echo "$gpu_memory_utilization >= 0.9" | bc -l) )); do - start_server $gpu_memory_utilization "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log" + # Pass empty string for profile_dir argument + start_server $gpu_memory_utilization "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log" "" result=$? if [[ "$result" -eq 0 ]]; then find_gpu_memory_utilization=1 @@ -252,5 +248,45 @@ for num_seqs in "${num_seqs_list[@]}"; do done done echo "finish permutations" + +# ================================================================================= +# FINAL PROFILING RUN FOR THE BEST CONFIGURATION +# ================================================================================= +if (( $(echo "$best_throughput > 0" | bc -l) )); then + echo + echo "Benchmark tuning finished. Now running profiling on the best configuration found..." + echo "Best config: max_num_seqs: $best_max_num_seqs, max_num_batched_tokens: $best_num_batched_tokens, throughput: $best_throughput" + echo + + vllm_log="$LOG_FOLDER/vllm_log_BEST_PROFILE.txt" + bm_log="$LOG_FOLDER/bm_log_BEST_PROFILE.txt" + + # Start server with the best params and profiling ENABLED + echo "Starting server for profiling..." + start_server $gpu_memory_utilization $best_max_num_seqs $best_num_batched_tokens "$vllm_log" "$PROFILE_PATH" + + # Run benchmark with the best params and the --profile flag + echo "Running benchmark with profiling..." 
+ prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 )) + adjusted_input_len=$(( INPUT_LEN - prefix_len )) + vllm bench serve \ + --backend vllm \ + --model $MODEL \ + --dataset-name random \ + --random-input-len $adjusted_input_len \ + --random-output-len $OUTPUT_LEN \ + --ignore-eos \ + --disable-tqdm \ + --request-rate $best_request_rate \ + --percentile-metrics ttft,tpot,itl,e2el \ + --goodput e2el:$MAX_LATENCY_ALLOWED_MS \ + --num-prompts 100 \ + --random-prefix-len $prefix_len \ + --port 8004 \ + --profile &> "$bm_log" +else + echo "No configuration met the latency requirements. Skipping final profiling run." +fi +pkill -if vllm echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT" diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 53bd3247afbb6..93b72211eb332 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -5,8 +5,7 @@ r"""Benchmark online serving throughput. 
On the server side, run one of the following commands: vLLM OpenAI API server vllm serve \ - --swap-space 16 \ - --disable-log-requests + --swap-space 16 On the client side, run: python benchmarks/benchmark_serving.py \ @@ -413,6 +412,10 @@ async def benchmark( print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="=")) print("{:<40} {:<10}".format("Successful requests:", metrics.completed)) + if max_concurrency is not None: + print("{:<40} {:<10}".format("Maximum request concurrency:", max_concurrency)) + if request_rate != float("inf"): + print("{:<40} {:<10.2f}".format("Request rate configured (RPS):", request_rate)) print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration)) print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input)) print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output)) diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py index d535cd5d7e1a6..ca6843a72aa36 100644 --- a/benchmarks/benchmark_serving_structured_output.py +++ b/benchmarks/benchmark_serving_structured_output.py @@ -4,7 +4,7 @@ r"""Benchmark online serving throughput with structured outputs. 
On the server side, run one of the following commands: (vLLM OpenAI API server) - vllm serve --disable-log-requests + vllm serve On the client side, run: python benchmarks/benchmark_serving_structured_output.py \ @@ -555,6 +555,10 @@ async def benchmark( print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="=")) print("{:<40} {:<10}".format("Successful requests:", metrics.completed)) + if max_concurrency is not None: + print("{:<40} {:<10}".format("Maximum request concurrency:", max_concurrency)) + if request_rate != float("inf"): + print("{:<40} {:<10.2f}".format("Request rate configured (RPS):", request_rate)) print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration)) print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input)) print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output)) diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index c350aaf5d3ad2..72250e2fb6d2b 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -22,6 +22,13 @@ from vllm.utils import FlexibleArgumentParser FP8_DTYPE = current_platform.fp8_dtype() +def ensure_divisibility(numerator, denominator): + """Ensure that numerator is divisible by the denominator.""" + assert numerator % denominator == 0, ( + "intermediate_size {} is not divisible by tp {}.".format(numerator, denominator) + ) + + class BenchmarkConfig(TypedDict): BLOCK_SIZE_M: int BLOCK_SIZE_N: int @@ -603,7 +610,7 @@ def main(args: argparse.Namespace): topk = config.num_experts_per_tok intermediate_size = config.intermediate_size shard_intermediate_size = 2 * intermediate_size // args.tp_size - + ensure_divisibility(intermediate_size, args.tp_size) hidden_size = config.hidden_size dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype use_fp8_w8a8 = args.dtype == "fp8_w8a8" diff --git a/benchmarks/kernels/benchmark_per_token_group_quant.py 
b/benchmarks/kernels/benchmark_per_token_group_quant.py new file mode 100644 index 0000000000000..1ccb5e08b3d57 --- /dev/null +++ b/benchmarks/kernels/benchmark_per_token_group_quant.py @@ -0,0 +1,159 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import argparse +import math +from contextlib import contextmanager +from typing import Callable +from unittest.mock import patch + +import torch + +from vllm.model_executor.layers.quantization.utils import fp8_utils, int8_utils +from vllm.platforms import current_platform + + +@contextmanager +def _triton_mode(): + """Temporarily force the Triton fallback path""" + with patch("vllm.platforms.current_platform.is_cuda", return_value=False): + yield + + +def _time_cuda( + fn: Callable[[], tuple[torch.Tensor, torch.Tensor]], + warmup_iters: int, + bench_iters: int, +) -> float: + # warmup + for _ in range(warmup_iters): + fn() + torch.cuda.synchronize() + + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + + start.record() + for _ in range(bench_iters): + fn() + end.record() + torch.cuda.synchronize() + + return start.elapsed_time(end) / bench_iters # ms/iter + + +def _run_single( + shape: tuple[int, int], + group_size: int, + dtype: str, + *, + column_major: bool = False, + scale_ue8m0: bool = False, + warmup_iters: int, + bench_iters: int, +) -> None: + num_tokens, hidden_dim = shape + + device = torch.device("cuda") + torch.manual_seed(42) + x = torch.randn(num_tokens, hidden_dim, device=device, dtype=torch.bfloat16) * 8 + + if dtype == "fp8": + + def cuda_impl(): + return fp8_utils.per_token_group_quant_fp8( + x, + group_size, + column_major_scales=column_major, + use_ue8m0=scale_ue8m0, + ) + + def triton_impl(): + with _triton_mode(): + return fp8_utils.per_token_group_quant_fp8( + x, + group_size, + column_major_scales=column_major, + use_ue8m0=scale_ue8m0, + ) + elif dtype == "int8": + + def cuda_impl(): + 
return int8_utils.per_token_group_quant_int8(x, group_size) + + def triton_impl(): + with _triton_mode(): + return int8_utils.per_token_group_quant_int8(x, group_size) + else: + raise ValueError("dtype must be 'fp8' or 'int8'") + + cuda_ms = _time_cuda(cuda_impl, warmup_iters, bench_iters) + triton_ms = _time_cuda(triton_impl, warmup_iters, bench_iters) + + speedup = triton_ms / cuda_ms if cuda_ms else math.inf + + cfg_desc = ( + f"shape={shape} gs={group_size:<3} col_major={column_major:<5} " + f"ue8m0={scale_ue8m0:<5} dtype={dtype}" + ) + print( + f"{cfg_desc:55} | CUDA {cuda_ms:7.3f} ms | Triton {triton_ms:7.3f} ms | " + f"speed-up ×{speedup:5.2f}" + ) + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--warmup-iters", type=int, default=10) + parser.add_argument("--bench-iters", type=int, default=100) + parser.add_argument("--dtype", choices=["fp8", "int8", "both"], default="both") + return parser.parse_args() + + +if __name__ == "__main__": + if not current_platform.is_cuda(): + raise RuntimeError("CUDA device is required to run this benchmark.") + + args = parse_args() + warmup_iters, bench_iters = args.warmup_iters, args.bench_iters + + shapes = [(32, 128), (64, 256), (16, 512)] + group_sizes = [64, 128] + + dtypes = ["fp8", "int8"] if args.dtype == "both" else [args.dtype] + + header = ( + "Configuration".ljust(55) + + " | " + + "CUDA (ms)".center(12) + + " | " + + "Triton (ms)".center(13) + + " | " + + "Speed-up" + ) + print(header) + print("-" * len(header)) + + for dtype in dtypes: + for shape in shapes: + for gs in group_sizes: + if dtype == "fp8": + for col_major in (False, True): + for ue8m0 in (False, True): + _run_single( + shape, + gs, + dtype, + column_major=col_major, + scale_ue8m0=ue8m0, + warmup_iters=warmup_iters, + bench_iters=bench_iters, + ) + else: # INT8 has no col-major / ue8m0 switches + _run_single( + shape, + gs, + dtype, + warmup_iters=warmup_iters, + bench_iters=bench_iters, + ) diff --git 
a/benchmarks/kernels/benchmark_reshape_and_cache_flash.py b/benchmarks/kernels/benchmark_reshape_and_cache_flash.py new file mode 100644 index 0000000000000..d4648c18f31d5 --- /dev/null +++ b/benchmarks/kernels/benchmark_reshape_and_cache_flash.py @@ -0,0 +1,156 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from __future__ import annotations + +import random +import time + +import torch +from tabulate import tabulate + +from vllm import _custom_ops as ops +from vllm.logger import init_logger +from vllm.platforms import current_platform +from vllm.utils import ( + STR_DTYPE_TO_TORCH_DTYPE, + FlexibleArgumentParser, + create_kv_caches_with_random_flash, +) + +logger = init_logger(__name__) + + +@torch.inference_mode() +def run_benchmark( + num_tokens: int, + num_heads: int, + head_size: int, + block_size: int, + num_blocks: int, + dtype: torch.dtype, + kv_cache_dtype: str, + kv_cache_layout: str, + num_iters: int, + device: str = "cuda", +) -> float: + """Return latency (seconds) for given num_tokens.""" + + if kv_cache_dtype == "fp8" and head_size % 16: + raise ValueError("fp8 kv-cache requires head_size to be a multiple of 16.") + + current_platform.seed_everything(42) + torch.set_default_device(device) + + # create random key / value tensors [T, H, D]. + key = torch.randn(num_tokens, num_heads, head_size, dtype=dtype, device=device) + value = torch.randn_like(key) + + # prepare the slot mapping. + # each token is assigned a unique slot in the KV-cache. 
+ num_slots = block_size * num_blocks + if num_tokens > num_slots: + raise ValueError("num_tokens cannot exceed the total number of cache slots") + slot_mapping_lst = random.sample(range(num_slots), num_tokens) + slot_mapping = torch.tensor(slot_mapping_lst, dtype=torch.long, device=device) + + key_caches, value_caches = create_kv_caches_with_random_flash( + num_blocks, + block_size, + 1, # num_layers + num_heads, + head_size, + kv_cache_dtype, + dtype, + device=device, + cache_layout=kv_cache_layout, + ) + key_cache, value_cache = key_caches[0], value_caches[0] + + # compute per-kernel scaling factors for fp8 conversion (if used). + k_scale = (key.amax() / 64.0).to(torch.float32) + v_scale = (value.amax() / 64.0).to(torch.float32) + + def run_cuda_benchmark(n_iters: int) -> float: + nonlocal key, value, key_cache, value_cache, slot_mapping + torch.cuda.synchronize() + start = time.perf_counter() + for _ in range(n_iters): + ops.reshape_and_cache_flash( + key, + value, + key_cache, + value_cache, + slot_mapping, + kv_cache_dtype, + k_scale, + v_scale, + ) + torch.cuda.synchronize() + end = time.perf_counter() + return (end - start) / n_iters + + # warm-up + run_cuda_benchmark(3) + + lat = run_cuda_benchmark(num_iters) + + # free tensors to mitigate OOM when sweeping + del key, value, key_cache, value_cache, slot_mapping + torch.cuda.empty_cache() + + return lat + + +def main(args): + rows = [] + for layout in ["NHD", "HND"]: + for exp in range(1, 17): + n_tok = 2**exp + lat = run_benchmark( + num_tokens=n_tok, + num_heads=args.num_heads, + head_size=args.head_size, + block_size=args.block_size, + num_blocks=args.num_blocks, + dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype], + kv_cache_dtype=args.kv_cache_dtype, + kv_cache_layout=layout, + num_iters=args.iters, + device="cuda", + ) + rows.append([n_tok, layout, f"{lat * 1e6:.3f}"]) + + print(tabulate(rows, headers=["num_tokens", "layout", "latency (µs)"])) + + +if __name__ == "__main__": + parser = 
FlexibleArgumentParser() + + parser.add_argument("--num-heads", type=int, default=128) + parser.add_argument( + "--head-size", + type=int, + choices=[64, 80, 96, 112, 120, 128, 192, 256], + default=128, + ) + parser.add_argument("--block-size", type=int, choices=[16, 32], default=16) + parser.add_argument("--num-blocks", type=int, default=128 * 512) + + parser.add_argument( + "--dtype", + type=str, + choices=["half", "bfloat16", "float"], + default="bfloat16", + ) + + parser.add_argument( + "--kv-cache-dtype", + type=str, + choices=["auto", "fp8"], + default="auto", + ) + + parser.add_argument("--iters", type=int, default=100) + args = parser.parse_args() + + main(args) diff --git a/benchmarks/kernels/benchmark_trtllm_attention.py b/benchmarks/kernels/benchmark_trtllm_decode_attention.py similarity index 88% rename from benchmarks/kernels/benchmark_trtllm_attention.py rename to benchmarks/kernels/benchmark_trtllm_decode_attention.py index 8c980f930366c..77136edca45b5 100644 --- a/benchmarks/kernels/benchmark_trtllm_attention.py +++ b/benchmarks/kernels/benchmark_trtllm_decode_attention.py @@ -41,7 +41,6 @@ def benchmark_decode( device = "cuda" torch.manual_seed(0) - # Currently only HEAD_GRP_SIZE == 8 is supported HEAD_GRP_SIZE = 8 MAX_SEQ_LEN = max_seq_len @@ -71,22 +70,20 @@ def benchmark_decode( if kv_cache_dtype.startswith("fp8"): kv_cache, _ = to_float8(kv_cache) + output_trtllm = torch.empty(q.shape, dtype=dtype) + # Benchmark TRT decode def trt_decode(): return flashinfer.decode.trtllm_batch_decode_with_kv_cache( q, kv_cache, workspace_buffer, - num_qo_heads, - num_kv_heads, - sm_scale, block_tables, kv_lens_tensor, - page_size, max_kv_len, - kv_cache_dtype, - k_scale, - v_scale, + bmm1_scale=k_scale * sm_scale, + bmm2_scale=v_scale, + out=output_trtllm, ) def time_fn(fn, warmup=10, trials=20): @@ -125,6 +122,8 @@ def benchmark_decode( kv_indices = torch.tensor(kv_indices, dtype=torch.int32) kv_last_page_lens = torch.tensor(kv_last_page_lens, 
dtype=torch.int32) + output_baseline = torch.empty(q.shape, dtype=dtype) + wrapper = flashinfer.BatchDecodeWithPagedKVCacheWrapper( workspace_buffer, kv_layout, @@ -145,7 +144,7 @@ def benchmark_decode( ) def baseline_decode(): - return wrapper.run(q, kv_cache, sm_scale, k_scale, v_scale) + return wrapper.run(q, kv_cache, sm_scale, k_scale, v_scale, output_baseline) baseline_mean, baseline_std = time_fn(baseline_decode) @@ -214,25 +213,39 @@ if __name__ == "__main__": max_seq_lens = [1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072] all_results = [] - print("Running benchmark for kv_cache_dtype: bfloat16") print( - "\tnum_seqs\tmax_seq_len\ttrt_mean\ttrt_std\tbaseline_mean\tbaseline_std\tspeedup_percent" + "Running benchmark for q_dtype = bfloat16, kv_cache_dtype: bfloat16, " + "output_dtype: bfloat16" + ) + print( + "\tnum_seqs\tmax_seq_len\ttrt_mean\ttrt_std\tbaseline_mean\t" + "baseline_std\tspeedup_percent" ) for max_seq_len in max_seq_lens: for bs in num_seqs: result = benchmark_decode( - bs, max_seq_len, dtype=torch.bfloat16, kv_cache_dtype="auto" + bs, + max_seq_len, + dtype=torch.bfloat16, + kv_cache_dtype="auto", ) all_results.append(result) - print("Running benchmark for q_dtype = bfloat16, kv_cache_dtype: fp8") print( - "\tnum_seqs\tmax_seq_len\ttrt_mean\ttrt_std\tbaseline_mean\tbaseline_std\tspeedup_percent" + "Running benchmark for q_dtype = bfloat16, kv_cache_dtype: fp8, " + "output_dtype: bfloat16" + ) + print( + "\tnum_seqs\tmax_seq_len\ttrt_mean\ttrt_std\tbaseline_mean\t" + "baseline_std\tspeedup_percent" ) for max_seq_len in max_seq_lens: for bs in num_seqs: result = benchmark_decode( - bs, max_seq_len, dtype=torch.bfloat16, kv_cache_dtype="fp8" + bs, + max_seq_len, + dtype=torch.bfloat16, + kv_cache_dtype="fp8", ) all_results.append(result) diff --git a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py new file mode 100644 index 0000000000000..67bd9aebbcca9 --- /dev/null +++ 
b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py @@ -0,0 +1,250 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import csv +import os +import random +from datetime import datetime + +import flashinfer +import torch + +FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 + +# KV Cache Layout for TRT-LLM +# kv_cache_shape = (num_blocks, 2, num_kv_heads, page_size, head_dim) + + +def to_float8(x, dtype=torch.float8_e4m3fn): + finfo = torch.finfo(dtype) + min_val, max_val = x.aminmax() + amax = torch.maximum(min_val.abs(), max_val.abs()).clamp(min=1e-12) + scale = finfo.max / amax * 0.1 + x_scl_sat = (x * scale).clamp(min=finfo.min, max=finfo.max) + return x_scl_sat.to(dtype), scale.float().reciprocal() + + +@torch.no_grad() +def benchmark_prefill( + num_seqs, + max_seq_len, + page_size=16, + dtype=torch.bfloat16, + kv_layout="HND", + num_kv_heads=8, + kv_cache_dtype="auto", + head_dim=128, + warmup=10, + trials=20, +): + torch.set_default_device("cuda") + torch.manual_seed(0) + + HEAD_GRP_SIZE = 8 + MAX_SEQ_LEN = max_seq_len + + # large number to reduce kv_cache reuse + NUM_BLOCKS = int(256000 / page_size) + + workspace_buffer = torch.empty(1024 * 1024 * 1024, dtype=torch.int8) + + num_qo_heads = num_kv_heads * HEAD_GRP_SIZE + sm_scale = float(1.0 / (head_dim**0.5)) + + q_lens = [random.randint(1, MAX_SEQ_LEN) for _ in range(num_seqs)] + q_lens[-1] = MAX_SEQ_LEN + max_q_len = max(q_lens) + q_indptr = torch.cat( + [ + torch.tensor([0], dtype=torch.int32), + torch.cumsum( + torch.tensor(q_lens, dtype=torch.int32), dim=0, dtype=torch.int32 + ), + ] + ) + q = torch.randn(sum(q_lens), num_qo_heads, head_dim, dtype=dtype) + + kv_lens = [random.randint(0, MAX_SEQ_LEN) for _ in range(num_seqs)] + kv_lens[-1] = MAX_SEQ_LEN + + seq_lens = [q_len + kv_len for q_len, kv_len in zip(q_lens, kv_lens)] + max_seq_len = max(seq_lens) + seq_lens_tensor = torch.tensor(seq_lens, dtype=torch.int32) + + 
max_num_blocks_per_seq = (max_seq_len + page_size - 1) // page_size + block_tables = torch.randint( + 0, NUM_BLOCKS, (num_seqs, max_num_blocks_per_seq), dtype=torch.int32 + ) + + kv_cache_shape = (NUM_BLOCKS, 2, num_kv_heads, page_size, head_dim) + kv_cache = torch.randn(size=kv_cache_shape, dtype=dtype) + k_scale = v_scale = 1.0 + + if kv_cache_dtype.startswith("fp8"): + kv_cache, _ = to_float8(kv_cache) + + output_trtllm = torch.empty(q.shape, dtype=dtype) + + kv_indptr = [0] + kv_indices = [] + kv_last_page_lens = [] + for i in range(num_seqs): + seq_len = seq_lens[i] + assert seq_len > 0 + num_blocks = (seq_len + page_size - 1) // page_size + kv_indices.extend(block_tables[i, :num_blocks]) + kv_indptr.append(kv_indptr[-1] + num_blocks) + kv_last_page_len = seq_len % page_size + if kv_last_page_len == 0: + kv_last_page_len = page_size + kv_last_page_lens.append(kv_last_page_len) + + kv_indptr = torch.tensor(kv_indptr, dtype=torch.int32) + kv_indices = torch.tensor(kv_indices, dtype=torch.int32) + kv_last_page_lens = torch.tensor(kv_last_page_lens, dtype=torch.int32) + + output_baseline = torch.empty(q.shape, dtype=dtype) + + wrapper = flashinfer.BatchPrefillWithPagedKVCacheWrapper( + workspace_buffer, kv_layout + ) + wrapper.plan( + q_indptr, + kv_indptr, + kv_indices, + kv_last_page_lens, + num_qo_heads, + num_kv_heads, + head_dim, + page_size, + causal=True, + sm_scale=sm_scale, + q_data_type=dtype, + kv_data_type=kv_cache.dtype, + ) + + def time_fn(fn, warmup=10, trials=20): + torch.cuda.synchronize() + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + times = [] + for i in range(warmup): + fn() + for i in range(trials): + start.record() + fn() + end.record() + torch.cuda.synchronize() + times.append(start.elapsed_time(end)) # ms + return sum(times) / len(times), torch.std(torch.tensor(times)) + + def baseline_prefill(): + return wrapper.run( + q, kv_cache, k_scale=k_scale, v_scale=v_scale, out=output_baseline + ) + + 
def trt_prefill(): + return flashinfer.prefill.trtllm_batch_context_with_kv_cache( + query=q, + kv_cache=kv_cache, + workspace_buffer=workspace_buffer, + block_tables=block_tables, + seq_lens=seq_lens_tensor, + max_q_len=max_q_len, + max_kv_len=max_seq_len, + bmm1_scale=k_scale * sm_scale, + bmm2_scale=v_scale, + batch_size=num_seqs, + cum_seq_lens_q=q_indptr, + cum_seq_lens_kv=kv_indptr, + out=output_trtllm, + ) + + trt_mean, trt_std = time_fn(trt_prefill) + baseline_mean, baseline_std = time_fn(baseline_prefill) + + # Calculate percentage speedup (positive means TRT is faster) + speedup_percent = (baseline_mean - trt_mean) / baseline_mean + + print( + f"\t{num_seqs}\t{max_seq_len}\t{trt_mean:.5f}\t{trt_std.item():.5f}" + f"\t{baseline_mean:.5f}\t{baseline_std.item():.5f}\t{speedup_percent:.5f}" + ) + + # Return results for CSV writing + return { + "num_seqs": num_seqs, + "trt_mean": trt_mean, + "trt_std": trt_std.item(), + "baseline_mean": baseline_mean, + "baseline_std": baseline_std.item(), + "speedup_percent": speedup_percent, + "q_dtype": str(dtype), + "kv_cache_dtype": kv_cache_dtype, + "page_size": page_size, + "num_kv_heads": num_kv_heads, + "head_dim": head_dim, + "max_seq_len": max_seq_len, + } + + +def write_results_to_csv(results, filename=None): + """Write benchmark results to CSV file.""" + if filename is None: + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + filename = f"flashinfer_trtllm_benchmark_{timestamp}.csv" + + fieldnames = [ + "num_seqs", + "trt_mean", + "trt_std", + "baseline_mean", + "baseline_std", + "speedup_percent", + "q_dtype", + "kv_cache_dtype", + "page_size", + "num_kv_heads", + "head_dim", + "max_seq_len", + ] + + file_exists = os.path.exists(filename) + + with open(filename, "a", newline="") as csvfile: + writer = csv.DictWriter(csvfile, fieldnames=fieldnames) + + if not file_exists: + writer.writeheader() + + for result in results: + writer.writerow(result) + + print(f"Results written to {filename}") + + +if __name__ ==
"__main__": + num_seqs = [1, 4, 8, 16, 32, 64, 128, 256] + max_seq_lens = [1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072] + all_results = [] + + print( + "Running benchmark for q_dtype = bfloat16, kv_cache_dtype: bfloat16, " + "output_dtype: bfloat16" + ) + print( + "\tnum_seqs\tmax_seq_len\ttrt_mean\ttrt_std\tbaseline_mean\t" + "baseline_std\tspeedup_percent" + ) + for max_seq_len in max_seq_lens: + for bs in num_seqs: + result = benchmark_prefill( + bs, + max_seq_len, + dtype=torch.bfloat16, + kv_cache_dtype="auto", + ) + all_results.append(result) + + # Write all results to CSV + write_results_to_csv(all_results) diff --git a/benchmarks/kernels/deepgemm/README.md b/benchmarks/kernels/deepgemm/README.md index 917e814010f89..41e68e047be82 100644 --- a/benchmarks/kernels/deepgemm/README.md +++ b/benchmarks/kernels/deepgemm/README.md @@ -8,7 +8,7 @@ Currently this just includes dense GEMMs and only works on Hopper GPUs. You need to install vLLM in your usual fashion, then install DeepGEMM from source in its own directory: -``` +```bash git clone --recursive https://github.com/deepseek-ai/DeepGEMM cd DeepGEMM python setup.py install @@ -17,7 +17,7 @@ uv pip install -e . ## Usage -``` +```console python benchmark_fp8_block_dense_gemm.py INFO 02-26 21:55:13 [__init__.py:207] Automatically detected platform cuda. 
===== STARTING FP8 GEMM BENCHMARK ===== diff --git a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py index 43c54d56ca8c1..b99c2099f2c38 100644 --- a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py +++ b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py @@ -4,49 +4,16 @@ # ruff: noqa: E501 import time -# Import DeepGEMM functions -import deep_gemm import torch -from deep_gemm import calc_diff, ceil_div, get_col_major_tma_aligned_tensor -# Import vLLM functions from vllm import _custom_ops as ops from vllm.model_executor.layers.quantization.utils.fp8_utils import ( + get_col_major_tma_aligned_tensor, per_token_group_quant_fp8, w8a8_block_fp8_matmul, ) from vllm.triton_utils import triton - - -# Copied from -# https://github.com/deepseek-ai/DeepGEMM/blob/78cacf70d41d15d688bd493ebc85845f7f2a3d5d/tests/test_core.py#L9 -def per_token_cast_to_fp8( - x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: - """Convert tensor to FP8 format with per-token scaling.""" - assert x.dim() == 2 and x.size(1) % 128 == 0 - m, n = x.shape - x_view = x.view(m, -1, 128) - x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4) - return (x_view * (448.0 / x_amax.unsqueeze(2))).to( - torch.float8_e4m3fn).view(m, n), (x_amax / 448.0).view(m, -1) - - -# Copied from -# https://github.com/deepseek-ai/DeepGEMM/blob/78cacf70d41d15d688bd493ebc85845f7f2a3d5d/tests/test_core.py#L17 -def per_block_cast_to_fp8( - x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: - """Convert tensor to FP8 format with per-block scaling.""" - assert x.dim() == 2 - m, n = x.shape - x_padded = torch.zeros((ceil_div(m, 128) * 128, ceil_div(n, 128) * 128), - dtype=x.dtype, - device=x.device) - x_padded[:m, :n] = x - x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, 128) - x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4) - x_scaled = (x_view * (448.0 / 
x_amax)).to(torch.float8_e4m3fn) - return x_scaled.view_as(x_padded)[:m, :n].contiguous(), ( - x_amax / 448.0).view(x_view.size(0), x_view.size(2)) +from vllm.utils.deep_gemm import calc_diff, fp8_gemm_nt, per_block_cast_to_fp8 def benchmark_shape(m: int, @@ -69,14 +36,14 @@ def benchmark_shape(m: int, # Pre-quantize B for all implementations # (weights can be pre-quantized offline) - B_deepgemm, B_scale_deepgemm = per_block_cast_to_fp8(B) - B_vllm, B_scale_vllm = per_block_cast_to_fp8(B) + B_deepgemm, B_scale_deepgemm = per_block_cast_to_fp8(B, [128, 128], use_ue8m0=True) + B_vllm, B_scale_vllm = per_block_cast_to_fp8(B, [128, 128], use_ue8m0=True) # Block size configuration block_size = [128, 128] # Pre-quantize A for all implementations - A_deepgemm, A_scale_deepgemm = per_token_cast_to_fp8(A) + A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8(A, block_size[1]) A_scale_deepgemm = get_col_major_tma_aligned_tensor(A_scale_deepgemm) C_deepgemm = torch.empty((m, n), device='cuda', dtype=torch.bfloat16) A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1]) @@ -85,7 +52,7 @@ def benchmark_shape(m: int, # === DeepGEMM Implementation === def deepgemm_gemm(): - deep_gemm.gemm_fp8_fp8_bf16_nt((A_deepgemm, A_scale_deepgemm), + fp8_gemm_nt((A_deepgemm, A_scale_deepgemm), (B_deepgemm, B_scale_deepgemm), C_deepgemm) return C_deepgemm diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index 88559c8fe7183..131dcb15cd7e9 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -5,6 +5,7 @@ #include "cuda_utils.h" #include "cuda_compat.h" #include "dispatch_utils.h" +#include "quantization/vectorization_utils.cuh" #ifdef USE_ROCM #include "quantization/fp8/amd/quant_utils.cuh" @@ -261,14 +262,26 @@ __global__ void reshape_and_cache_kernel( } } +// Used by vectorization_utils to copy/convert one element +template <Fp8KVCacheDataType kv_dt, typename OutT, typename InT> +struct CopyWithScaleOp { + float scale; + + __device__ __forceinline__ void operator()(OutT& dst, const InT src) const { + if
constexpr (kv_dt == Fp8KVCacheDataType::kAuto) { + dst = static_cast<OutT>(src); + } else { + dst = fp8::scaled_convert<OutT, InT, kv_dt>(src, scale); + } + } +}; + template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt> __global__ void reshape_and_cache_flash_kernel( const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size] const scalar_t* __restrict__ value, // [num_tokens, num_heads, head_size] - cache_t* __restrict__ key_cache, // [num_blocks, block_size, num_heads, - // head_size] - cache_t* __restrict__ value_cache, // [num_blocks, block_size, num_heads, - // head_size] + cache_t* __restrict__ key_cache, // NHD or HND, see layout comments below + cache_t* __restrict__ value_cache, // same layout as key_cache const int64_t* __restrict__ slot_mapping, // [num_tokens] const int64_t block_stride, const int64_t page_stride, const int64_t head_stride, const int64_t key_stride, @@ -282,25 +295,58 @@ __global__ void reshape_and_cache_flash_kernel( } const int64_t block_idx = slot_idx / block_size; const int64_t block_offset = slot_idx % block_size; - const int n = num_heads * head_size; - for (int i = threadIdx.x; i < n; i += blockDim.x) { - const int64_t src_key_idx = token_idx * key_stride + i; - const int64_t src_value_idx = token_idx * value_stride + i; - const int head_idx = i / head_size; - const int head_offset = i % head_size; - const int64_t tgt_key_value_idx = block_idx * block_stride + - block_offset * page_stride + - head_idx * head_stride + head_offset; - scalar_t tgt_key = key[src_key_idx]; - scalar_t tgt_value = value[src_value_idx]; - if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) { - key_cache[tgt_key_value_idx] = tgt_key; - value_cache[tgt_key_value_idx] = tgt_value; - } else { - key_cache[tgt_key_value_idx] = - fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_key, *k_scale); - value_cache[tgt_key_value_idx] = - fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_value, *v_scale); + const int n_elems = num_heads * head_size; + + // pointers to the beginning of the source row for this token.
+ const scalar_t* __restrict__ key_src = key + token_idx * key_stride; + const scalar_t* __restrict__ value_src = value + token_idx * value_stride; + + // find the start position inside the kv-cache for this token. + cache_t* __restrict__ key_dst = + key_cache + block_idx * block_stride + block_offset * page_stride; + cache_t* __restrict__ value_dst = + value_cache + block_idx * block_stride + block_offset * page_stride; + + // this is true for the NHD layout where `head_stride == head_size` + const bool is_contiguous_heads = (head_stride == head_size); + + float k_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *k_scale; + float v_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *v_scale; + constexpr int VEC_SIZE = (sizeof(scalar_t) == 2) ? 8 : 4; + CopyWithScaleOp<kv_dt, cache_t, scalar_t> k_op{k_scale_val}; + CopyWithScaleOp<kv_dt, cache_t, scalar_t> v_op{v_scale_val}; + if (is_contiguous_heads) { + // NHD layout + // kv cache: [num_blocks, block_size, num_heads, head_size] + vectorize_with_alignment<VEC_SIZE>(key_src, key_dst, n_elems, threadIdx.x, + blockDim.x, k_op); + + vectorize_with_alignment<VEC_SIZE>(value_src, value_dst, n_elems, + threadIdx.x, blockDim.x, v_op); + + } else { + // HND layout: heads are strided, but each head_size segment is contiguous + // kv cache: [num_blocks, num_heads, block_size, head_size] + const int lane = threadIdx.x & 31; // 0..31 within warp + const int warp_id = threadIdx.x >> 5; // warp index within block + const int warps_per_block = blockDim.x >> 5; + + for (int head = warp_id; head < num_heads; head += warps_per_block) { + const scalar_t* __restrict__ k_src_h = key_src + head * head_size; + const scalar_t* __restrict__ v_src_h = value_src + head * head_size; + + cache_t* __restrict__ k_dst_h = + key_dst + static_cast<int64_t>(head) * head_stride; + cache_t* __restrict__ v_dst_h = + value_dst + static_cast<int64_t>(head) * head_stride; + + // within each head, let the 32 threads of the warp perform the vector + // copy + vectorize_with_alignment<VEC_SIZE>(k_src_h, k_dst_h, head_size, lane, 32, + k_op);
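As a cross-check outside the patch, the destination indexing shared by the two cache layouts above (NHD and HND) can be reproduced with a small NumPy sketch. The tensor sizes here are hypothetical, and strides are expressed in elements, mirroring `block_stride`, `page_stride`, and `head_stride`; the point is that one flat-index formula serves both layouts, only the stride values differ:

```python
import numpy as np

def kv_flat_index(block_stride, page_stride, head_stride,
                  block_idx, block_offset, head, offset):
    # Same formula the kernel uses for both layouts:
    # block_idx * block_stride + block_offset * page_stride
    #   + head * head_stride + offset
    return (block_idx * block_stride + block_offset * page_stride
            + head * head_stride + offset)

# Hypothetical cache geometry.
num_blocks, block_size, num_heads, head_size = 4, 16, 8, 64

# NHD: [num_blocks, block_size, num_heads, head_size] -> contiguous heads,
# so head_stride == head_size (the is_contiguous_heads case).
nhd = np.arange(num_blocks * block_size * num_heads * head_size,
                dtype=np.int64).reshape(num_blocks, block_size,
                                        num_heads, head_size)
nhd_strides = [s // nhd.itemsize for s in nhd.strides]  # elements, not bytes

# HND: [num_blocks, num_heads, block_size, head_size] -> heads are strided.
hnd = np.ascontiguousarray(nhd.transpose(0, 2, 1, 3))
hnd_strides = [s // hnd.itemsize for s in hnd.strides]

b, o, h, e = 2, 5, 3, 17  # block, slot-in-block, head, element-in-head
i_nhd = kv_flat_index(nhd_strides[0], nhd_strides[1], nhd_strides[2],
                      b, o, h, e)
i_hnd = kv_flat_index(hnd_strides[0], hnd_strides[2], hnd_strides[1],
                      b, o, h, e)
```

For HND, the stride of the `block_size` axis plays the role of `page_stride` and the stride of the `num_heads` axis the role of `head_stride`, which is why the kernel can dispatch on `head_stride == head_size` alone.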
+ + vectorize_with_alignment<VEC_SIZE>(v_src_h, v_dst_h, head_size, lane, 32, + v_op); } } } diff --git a/csrc/cpu/quant.cpp b/csrc/cpu/quant.cpp index c1f7c64ea2f49..6e120b8d20a7e 100644 --- a/csrc/cpu/quant.cpp +++ b/csrc/cpu/quant.cpp @@ -16,12 +16,14 @@ struct KernelVecType { using cvt_vec_type = vec_op::FP32Vec16; }; +#if !defined(__aarch64__) || defined(ARM_BF16_SUPPORT) template <> struct KernelVecType<c10::BFloat16> { using load_vec_type = vec_op::BF16Vec16; using azp_adj_load_vec_type = vec_op::INT32Vec16; using cvt_vec_type = vec_op::FP32Vec16; }; +#endif template <> struct KernelVecType<c10::Half> { diff --git a/csrc/moe/topk_softmax_kernels.cu b/csrc/moe/topk_softmax_kernels.cu index 0b505d2e04a21..7a7865b901de1 100644 --- a/csrc/moe/topk_softmax_kernels.cu +++ b/csrc/moe/topk_softmax_kernels.cu @@ -24,9 +24,12 @@ #ifndef USE_ROCM #include <cub/util_type.cuh> #include <cub/cub.cuh> + #include <cuda/std/functional> + using AddOp = cuda::std::plus<float>; #else #include <hipcub/util_type.hpp> #include <hipcub/hipcub.hpp> + using AddOp = cub::Sum; #endif #define MAX(a, b) ((a) > (b) ? (a) : (b)) @@ -62,7 +65,6 @@ __launch_bounds__(TPB) __global__ const int thread_row_offset = blockIdx.x * num_cols; - cub::Sum sum; float threadData(-FLT_MAX); // Don't touch finished rows.
@@ -92,7 +94,7 @@ __launch_bounds__(TPB) __global__ threadData += exp((static_cast<float>(input[idx]) - float_max)); } - const auto Z = BlockReduce(tmpStorage).Reduce(threadData, sum); + const auto Z = BlockReduce(tmpStorage).Reduce(threadData, AddOp()); if (threadIdx.x == 0) { diff --git a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu index 6a81f159f46ae..d8369108d0bd3 100644 --- a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu +++ b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu @@ -1,7 +1,9 @@ #include <ATen/cuda/CUDAContext.h> #include <torch/all.h> -#include "../per_token_group_quant_8bit.h" +#ifndef USE_ROCM + #include "../per_token_group_quant_8bit.h" +#endif #include <cmath> @@ -339,10 +341,12 @@ void dynamic_scaled_int8_quant( }); } +#ifndef USE_ROCM void per_token_group_quant_int8(const torch::Tensor& input, torch::Tensor& output_q, torch::Tensor& output_s, int64_t group_size, double eps, double int8_min, double int8_max) { per_token_group_quant_8bit(input, output_q, output_s, group_size, eps, int8_min, int8_max); -} \ No newline at end of file +} +#endif diff --git a/csrc/quantization/cutlass_w8a8/Epilogues.md b/csrc/quantization/cutlass_w8a8/Epilogues.md index a30e1fdf3ac77..15a66913e97a3 100644 --- a/csrc/quantization/cutlass_w8a8/Epilogues.md +++ b/csrc/quantization/cutlass_w8a8/Epilogues.md @@ -86,6 +86,7 @@ D = s_a s_b \widehat A \widehat B ``` Epilogue parameters: + - `scale_a` is the scale for activations, can be per-tensor (scalar) or per-token (column-vector). - `scale_b` is the scale for weights, can be per-tensor (scalar) or per-channel (row-vector). @@ -135,7 +136,7 @@ That is precomputed and stored in `azp_with_adj` as a row-vector. Epilogue parameters: - `scale_a` is the scale for activations, can be per-tensor (scalar) or per-token (column-vector). - - Generally this will be per-tensor as the zero-points are per-tensor. + - Generally this will be per-tensor as the zero-points are per-tensor.
- `scale_b` is the scale for weights, can be per-tensor (scalar) or per-channel (row-vector). - `azp_with_adj` is the precomputed zero-point term ($` z_a J_a \widehat B `$), is per-channel (row-vector). - `bias` is the bias, is always per-channel (row-vector). @@ -152,7 +153,7 @@ That means the zero-point term $` z_a J_a \widehat B `$ becomes an outer product Epilogue parameters: - `scale_a` is the scale for activations, can be per-tensor (scalar) or per-token (column-vector). - - Generally this will be per-token as the zero-points are per-token. + - Generally this will be per-token as the zero-points are per-token. - `scale_b` is the scale for weights, can be per-tensor (scalar) or per-channel (row-vector). - `azp_adj` is the precomputed zero-point adjustment term ($` \mathbf 1 \widehat B `$), is per-channel (row-vector). - `azp` is the zero-point (`z_a`), is per-token (column-vector). diff --git a/csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu b/csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu index a21ee55b65862..03db5cc196d59 100644 --- a/csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu +++ b/csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu @@ -335,7 +335,7 @@ void run_fp4_blockwise_scaled_group_mm( TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to run GEMM"); } -#if defined ENABLE_NVFP4 && ENABLE_NVFP4 +#if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100 constexpr auto FLOAT4_E2M1X2 = at::ScalarType::Byte; constexpr auto SF_DTYPE = at::ScalarType::Float8_e4m3fn; #endif @@ -356,7 +356,7 @@ void cutlass_fp4_group_mm( const torch::Tensor& a_blockscale, const torch::Tensor& b_blockscales, const torch::Tensor& alphas, const torch::Tensor& problem_sizes, const torch::Tensor& expert_offsets, const torch::Tensor& sf_offsets) { -#if defined ENABLE_NVFP4 && ENABLE_NVFP4 +#if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100 // Input validation CHECK_INPUT(a, FLOAT4_E2M1X2, "a"); CHECK_INPUT(b, FLOAT4_E2M1X2, "b"); @@ -398,7 +398,7 @@ void 
cutlass_fp4_group_mm( TORCH_CHECK_NOT_IMPLEMENTED( false, "No compiled cutlass_fp4_group_mm kernel, vLLM must " - "be compiled with ENABLE_NVFP4 for SM100+ and CUDA " + "be compiled with ENABLE_NVFP4_SM100 for SM100+ and CUDA " "12.8 or above."); #endif } diff --git a/csrc/quantization/fp4/nvfp4_quant_entry.cu b/csrc/quantization/fp4/nvfp4_quant_entry.cu index badbb7e310df0..1b61bd4519fc3 100644 --- a/csrc/quantization/fp4/nvfp4_quant_entry.cu +++ b/csrc/quantization/fp4/nvfp4_quant_entry.cu @@ -16,14 +16,15 @@ #include -#if defined ENABLE_NVFP4 && ENABLE_NVFP4 -void scaled_fp4_quant_sm100a(torch::Tensor const& output, +#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \ + (defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120) +void scaled_fp4_quant_sm1xxa(torch::Tensor const& output, torch::Tensor const& input, torch::Tensor const& output_sf, torch::Tensor const& input_sf); #endif -#if defined ENABLE_NVFP4 && ENABLE_NVFP4 +#if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100 void scaled_fp4_experts_quant_sm100a( torch::Tensor& output, torch::Tensor& output_scale, torch::Tensor const& input, torch::Tensor const& input_global_scale, @@ -33,8 +34,9 @@ void scaled_fp4_experts_quant_sm100a( void scaled_fp4_quant(torch::Tensor& output, torch::Tensor const& input, torch::Tensor& output_sf, torch::Tensor const& input_sf) { -#if defined ENABLE_NVFP4 && ENABLE_NVFP4 - return scaled_fp4_quant_sm100a(output, input, output_sf, input_sf); +#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \ + (defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120) + return scaled_fp4_quant_sm1xxa(output, input, output_sf, input_sf); #endif TORCH_CHECK_NOT_IMPLEMENTED(false, "No compiled nvfp4 quantization kernel"); } @@ -44,7 +46,7 @@ void scaled_fp4_experts_quant( torch::Tensor const& input, torch::Tensor const& input_global_scale, torch::Tensor const& input_offset_by_experts, torch::Tensor const& output_scale_offset_by_experts) { -#if defined ENABLE_NVFP4 && ENABLE_NVFP4 +#if 
defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100 return scaled_fp4_experts_quant_sm100a( output, output_scale, input, input_global_scale, input_offset_by_experts, output_scale_offset_by_experts); diff --git a/csrc/quantization/fp4/nvfp4_quant_kernels.cu b/csrc/quantization/fp4/nvfp4_quant_kernels.cu index d32911357a953..4e080de151648 100644 --- a/csrc/quantization/fp4/nvfp4_quant_kernels.cu +++ b/csrc/quantization/fp4/nvfp4_quant_kernels.cu @@ -332,7 +332,7 @@ template void invokeFP4Quantization(int m, int n, __nv_bfloat16 const* input, int multiProcessorCount, cudaStream_t stream); -void scaled_fp4_quant_sm100a(torch::Tensor const& output, +void scaled_fp4_quant_sm1xxa(torch::Tensor const& output, torch::Tensor const& input, torch::Tensor const& output_sf, torch::Tensor const& input_sf) { diff --git a/csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu b/csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu index 61b75e92dfaa0..9cba2828aac2e 100644 --- a/csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu +++ b/csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu @@ -16,7 +16,7 @@ #include -#if defined ENABLE_NVFP4 && ENABLE_NVFP4 +#if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100 void cutlass_scaled_fp4_mm_sm100a(torch::Tensor& D, torch::Tensor const& A, torch::Tensor const& B, torch::Tensor const& A_sf, @@ -24,12 +24,22 @@ void cutlass_scaled_fp4_mm_sm100a(torch::Tensor& D, torch::Tensor const& A, torch::Tensor const& alpha); #endif +#if defined ENABLE_NVFP4_SM120 && ENABLE_NVFP4_SM120 +void cutlass_scaled_fp4_mm_sm120a(torch::Tensor& D, torch::Tensor const& A, + torch::Tensor const& B, + torch::Tensor const& A_sf, + torch::Tensor const& B_sf, + torch::Tensor const& alpha); +#endif + void cutlass_scaled_fp4_mm(torch::Tensor& D, torch::Tensor const& A, torch::Tensor const& B, torch::Tensor const& A_sf, torch::Tensor const& B_sf, torch::Tensor const& alpha) { -#if defined ENABLE_NVFP4 && ENABLE_NVFP4 +#if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100 return 
cutlass_scaled_fp4_mm_sm100a(D, A, B, A_sf, B_sf, alpha); +#elif defined ENABLE_NVFP4_SM120 && ENABLE_NVFP4_SM120 + return cutlass_scaled_fp4_mm_sm120a(D, A, B, A_sf, B_sf, alpha); #endif TORCH_CHECK_NOT_IMPLEMENTED(false, "No compiled nvfp4 mm kernel, vLLM should " diff --git a/csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu b/csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu new file mode 100644 index 0000000000000..89de23b76e65d --- /dev/null +++ b/csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu @@ -0,0 +1,285 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include <torch/all.h> + +#include <ATen/cuda/CUDAContext.h> +#include <c10/cuda/CUDAGuard.h> + +#include "cutlass_extensions/common.hpp" + +#include "cutlass/cutlass.h" + +#include "cutlass/gemm/collective/collective_builder.hpp" +#include "cutlass/epilogue/collective/collective_builder.hpp" +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/kernel/gemm_universal.hpp" + +#include "cutlass/util/packed_stride.hpp" + +#include "core/math.hpp" + +using namespace cute; + +#define CHECK_TYPE(x, st, m) \ + TORCH_CHECK(x.scalar_type() == st, ": Inconsistency of Tensor type:", m) +#define CHECK_TH_CUDA(x, m) \ + TORCH_CHECK(x.is_cuda(), m, ": must be a CUDA tensor") +#define CHECK_CONTIGUOUS(x, m) \ + TORCH_CHECK(x.is_contiguous(), m, ": must be contiguous") +#define CHECK_INPUT(x, st, m) \ + CHECK_TH_CUDA(x, m); \ + CHECK_CONTIGUOUS(x, m); \ + CHECK_TYPE(x, st, m) + +constexpr auto FLOAT4_E2M1X2 = at::ScalarType::Byte; +constexpr auto SF_DTYPE = at::ScalarType::Float8_e4m3fn; + +struct sm120_fp4_config_M256 { + using ClusterShape = Shape<_1, _1, _1>; + using MmaTileShape = Shape<_128, _128, _128>; + using PerSmTileShape_MNK = Shape<_128, _128, _128>; +}; + +struct sm120_fp4_config_default { + using ClusterShape = Shape<_1, _1, _1>; + using MmaTileShape = Shape<_256, _128, _128>; + using PerSmTileShape_MNK = Shape<_256, _128, _128>; +}; + +template <typename OutType, typename Config> +struct Fp4GemmSm120 { + using ElementA = cutlass::nv_float4_t<cutlass::float_e2m1_t>; + using LayoutATag = cutlass::layout::RowMajor; + static constexpr int AlignmentA = 32; + + using ElementB = cutlass::nv_float4_t<cutlass::float_e2m1_t>; + using LayoutBTag = cutlass::layout::ColumnMajor; + static constexpr int AlignmentB = 32; + + using ElementD = OutType; + using ElementC = OutType; + using LayoutCTag = cutlass::layout::RowMajor; + using LayoutDTag = cutlass::layout::RowMajor; + static constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; + static constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; + + using ElementAccumulator = float; + using ArchTag = cutlass::arch::Sm120; +
using OperatorClass = cutlass::arch::OpClassBlockScaledTensorOp; + + using MmaTileShape = typename Config::MmaTileShape; + using ClusterShape = typename Config::ClusterShape; + using PerSmTileShape_MNK = typename Config::PerSmTileShape_MNK; + + using CollectiveEpilogue = + typename cutlass::epilogue::collective::CollectiveBuilder< + ArchTag, OperatorClass, PerSmTileShape_MNK, ClusterShape, + cutlass::epilogue::collective::EpilogueTileAuto, ElementAccumulator, + ElementAccumulator, ElementC, LayoutCTag, AlignmentC, ElementD, + LayoutDTag, AlignmentD, + cutlass::epilogue::collective::EpilogueScheduleAuto>::CollectiveOp; + + using CollectiveMainloop = + typename cutlass::gemm::collective::CollectiveBuilder< + ArchTag, OperatorClass, ElementA, LayoutATag, AlignmentA, ElementB, + LayoutBTag, AlignmentB, ElementAccumulator, MmaTileShape, + ClusterShape, + cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>( + sizeof(typename CollectiveEpilogue::SharedStorage))>, + cutlass::gemm::collective::KernelScheduleAuto>::CollectiveOp; + + using GemmKernel = cutlass::gemm::kernel::GemmUniversal< + Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue, void>; + + using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>; +}; + +template <typename Gemm> +typename Gemm::Arguments args_from_options(at::Tensor& D, at::Tensor const& A, + at::Tensor const& B, + at::Tensor const& A_sf, + at::Tensor const& B_sf, + torch::Tensor const& alpha, int M, + int N, int K) { + using ElementA = typename Gemm::ElementA; + using ElementB = typename Gemm::ElementB; + using ElementD = typename Gemm::ElementD; + using ElementSFA = cutlass::float_ue4m3_t; + using ElementSFB = cutlass::float_ue4m3_t; + using ElementCompute = float; + + using StrideA = typename Gemm::GemmKernel::StrideA; + using StrideB = typename Gemm::GemmKernel::StrideB; + using StrideC = typename Gemm::GemmKernel::StrideC; + using StrideD = typename Gemm::GemmKernel::StrideD; + + using Sm1xxBlkScaledConfig = + typename
Gemm::GemmKernel::CollectiveMainloop::Sm1xxBlkScaledConfig; + + auto stride_A = cutlass::make_cute_packed_stride(StrideA{}, {M, K, 1}); + auto stride_B = cutlass::make_cute_packed_stride(StrideB{}, {N, K, 1}); + auto stride_D = cutlass::make_cute_packed_stride(StrideD{}, {M, N, 1}); + + auto layout_SFA = Sm1xxBlkScaledConfig::tile_atom_to_shape_SFA( + cute::make_shape(M, N, K, 1)); + auto layout_SFB = Sm1xxBlkScaledConfig::tile_atom_to_shape_SFB( + cute::make_shape(M, N, K, 1)); + + typename Gemm::Arguments arguments{ + cutlass::gemm::GemmUniversalMode::kGemm, + {M, N, K, 1}, + {static_cast<ElementA const*>(A.data_ptr()), stride_A, + static_cast<ElementB const*>(B.data_ptr()), stride_B, + static_cast<ElementSFA const*>(A_sf.data_ptr()), layout_SFA, + static_cast<ElementSFB const*>(B_sf.data_ptr()), layout_SFB}, + {{}, + static_cast<ElementD const*>(D.data_ptr()), + stride_D, + static_cast<ElementD*>(D.data_ptr()), + stride_D}}; + auto& fusion_args = arguments.epilogue.thread; + fusion_args.alpha_ptr = static_cast<ElementCompute const*>(alpha.data_ptr()); + + return arguments; +} + +template <typename Gemm> +void runGemm(at::Tensor& D, at::Tensor const& A, at::Tensor const& B, + at::Tensor const& A_sf, at::Tensor const& B_sf, + torch::Tensor const& alpha, int M, int N, int K, + cudaStream_t stream) { + Gemm gemm; + + auto arguments = args_from_options<Gemm>(D, A, B, A_sf, B_sf, alpha, M, N, K); + + size_t workspace_size = Gemm::get_workspace_size(arguments); + auto const workspace_options = + torch::TensorOptions().dtype(torch::kUInt8).device(A.device()); + auto workspace = torch::empty(workspace_size, workspace_options); + + CUTLASS_CHECK(gemm.can_implement(arguments)); + + CUTLASS_CHECK(gemm.initialize(arguments, workspace.data_ptr(), stream)); + + CUTLASS_CHECK(gemm.run(arguments, workspace.data_ptr(), stream)); +} + +void cutlass_fp4_bf16_gemm_dispatch(torch::Tensor& D, torch::Tensor const& A, + torch::Tensor const& B, + torch::Tensor const& A_sf, + torch::Tensor const& B_sf, + torch::Tensor const& alpha, int m, int n, + int k, cudaStream_t stream) { + uint32_t const mp2 = std::max(static_cast<uint32_t>(16),
next_pow_2(m)); + if (mp2 <= 256) { + runGemm<typename Fp4GemmSm120<cutlass::bfloat16_t, + sm120_fp4_config_M256>::Gemm>( + D, A, B, A_sf, B_sf, alpha, m, n, k, stream); + } else { + runGemm<typename Fp4GemmSm120<cutlass::bfloat16_t, + sm120_fp4_config_default>::Gemm>( + D, A, B, A_sf, B_sf, alpha, m, n, k, stream); + } +} + +void cutlass_fp4_f16_gemm_dispatch(torch::Tensor& D, torch::Tensor const& A, + torch::Tensor const& B, + torch::Tensor const& A_sf, + torch::Tensor const& B_sf, + torch::Tensor const& alpha, int m, int n, + int k, cudaStream_t stream) { + uint32_t const mp2 = std::max(static_cast<uint32_t>(16), next_pow_2(m)); + if (mp2 <= 256) { + runGemm<typename Fp4GemmSm120<cutlass::half_t, + sm120_fp4_config_M256>::Gemm>( + D, A, B, A_sf, B_sf, alpha, m, n, k, stream); + } else { + runGemm<typename Fp4GemmSm120<cutlass::half_t, + sm120_fp4_config_default>::Gemm>( + D, A, B, A_sf, B_sf, alpha, m, n, k, stream); + } +} + +void cutlass_scaled_fp4_mm_sm120a(torch::Tensor& D, torch::Tensor const& A, + torch::Tensor const& B, + torch::Tensor const& A_sf, + torch::Tensor const& B_sf, + torch::Tensor const& alpha) { +#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) + CHECK_INPUT(A, FLOAT4_E2M1X2, "a"); + CHECK_INPUT(B, FLOAT4_E2M1X2, "b"); + + CHECK_INPUT(A_sf, SF_DTYPE, "scale_a"); + CHECK_INPUT(B_sf, SF_DTYPE, "scale_b"); + + CHECK_INPUT(alpha, at::ScalarType::Float, "alpha"); + + TORCH_CHECK(A.dim() == 2, "a must be a matrix"); + TORCH_CHECK(B.dim() == 2, "b must be a matrix"); + TORCH_CHECK(A.sizes()[1] == B.sizes()[1], + "a and b shapes cannot be multiplied (", A.sizes()[0], "x", + A.sizes()[1], " and ", B.sizes()[0], "x", B.sizes()[1], ")"); + + auto const m = A.sizes()[0]; + auto const n = B.sizes()[0]; + auto const k = A.sizes()[1] * 2; + + constexpr int alignment = 32; + TORCH_CHECK(k % alignment == 0, "Expected k to be divisible by ", alignment, + ", but got a shape: (", A.sizes()[0], "x", A.sizes()[1], + "), k: ", k, "."); + TORCH_CHECK(n % alignment == 0, "Expected n to be divisible by ", alignment, + ", but got b shape: (", B.sizes()[0], "x", B.sizes()[1], ")."); + + auto round_up = [](int x, int y) { return (x + y - 1) / y * y; }; + int rounded_m = round_up(m, 128); + int rounded_n = round_up(n, 128); + // Since k is divisible
by 32 (alignment), k / 16 is guaranteed to be an + // integer. + int rounded_k = round_up(k / 16, 4); + + TORCH_CHECK(A_sf.dim() == 2, "scale_a must be a matrix"); + TORCH_CHECK(B_sf.dim() == 2, "scale_b must be a matrix"); + TORCH_CHECK(A_sf.sizes()[1] == B_sf.sizes()[1], + "scale_a and scale_b shapes cannot be multiplied (", + A_sf.sizes()[0], "x", A_sf.sizes()[1], " and ", B_sf.sizes()[0], + "x", B_sf.sizes()[1], ")"); + TORCH_CHECK(A_sf.sizes()[0] == rounded_m && A_sf.sizes()[1] == rounded_k, + "scale_a must be padded and swizzled to a shape (", rounded_m, + "x", rounded_k, "), but got a shape (", A_sf.sizes()[0], "x", + A_sf.sizes()[1], ")"); + TORCH_CHECK(B_sf.sizes()[0] == rounded_n && B_sf.sizes()[1] == rounded_k, + "scale_b must be padded and swizzled to a shape (", rounded_n, + "x", rounded_k, "), but got a shape (", B_sf.sizes()[0], "x", + B_sf.sizes()[1], ")"); + + auto out_dtype = D.dtype(); + const at::cuda::OptionalCUDAGuard device_guard(device_of(A)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(A.get_device()); + + if (out_dtype == at::ScalarType::BFloat16) { + return cutlass_fp4_bf16_gemm_dispatch(D, A, B, A_sf, B_sf, alpha, m, n, k, + stream); + } else if (out_dtype == at::ScalarType::Half) { + return cutlass_fp4_f16_gemm_dispatch(D, A, B, A_sf, B_sf, alpha, m, n, k, + stream); + } else { + TORCH_CHECK(false, "Unsupported output data type of nvfp4 mm sm120 (", + out_dtype, ")"); + } +#else + TORCH_CHECK(false, + "Unsupported CUTLASS version. 
Set VLLM_CUTLASS_SRC_DIR to " + "a CUTLASS 3.8 source directory to enable support."); +#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) +} \ No newline at end of file diff --git a/csrc/quantization/fp8/common.cu b/csrc/quantization/fp8/common.cu index 0e1eab66f0b98..5fe5dd04bd891 100644 --- a/csrc/quantization/fp8/common.cu +++ b/csrc/quantization/fp8/common.cu @@ -1,7 +1,8 @@ #include "common.cuh" #include "dispatch_utils.h" - +#include "../vectorization_utils.cuh" #include <c10/cuda/CUDAGuard.h> +#include #ifndef USE_ROCM #include <cub/cub.cuh> @@ -12,74 +13,127 @@ namespace vllm { template <typename scalar_t, typename fp8_type> -__global__ void scaled_fp8_quant_kernel(fp8_type* __restrict__ out, - const scalar_t* __restrict__ input, - const float* __restrict__ scale, - int64_t num_elems) { - int tid = blockDim.x * blockIdx.x + threadIdx.x; +__global__ void scaled_fp8_quant_kernel_strided( + fp8_type* __restrict__ out, const scalar_t* __restrict__ input, + const float* __restrict__ scale, int hidden_size, int64_t in_row_stride, + int64_t out_row_stride) { + const int64_t token_idx = blockIdx.x; // one token per block + const int tid = threadIdx.x; - // Invert the scale so that we can use multiplications to avoid expensive - // division.
- const float inverted_scale = 1.0f / (*scale); - scaled_fp8_conversion_vec<scalar_t, true>( - out, input, inverted_scale, num_elems, tid, blockDim.x * gridDim.x); + const scalar_t* token_in = input + token_idx * in_row_stride; + fp8_type* token_out = out + token_idx * out_row_stride; + + const float inv_scale = 1.0f / (*scale); + + vectorize_with_alignment<16>( + token_in, token_out, hidden_size, tid, blockDim.x, + [=] __device__(fp8_type & dst, const scalar_t& src) { + dst = scaled_fp8_conversion<true, fp8_type>(static_cast<float>(src), + inv_scale); + }); } template <typename scalar_t, typename fp8_type> -__global__ void dynamic_per_token_scaled_fp8_quant_kernel( - fp8_type* __restrict__ out, float* __restrict__ scale, - scalar_t const* __restrict__ input, float const* __restrict__ scale_ub, - const int hidden_size) { - int const tid = threadIdx.x; - int const token_idx = blockIdx.x; +__global__ void segmented_max_reduction_strided( + float* __restrict__ scale, const scalar_t* __restrict__ input, + int hidden_size, int64_t in_row_stride, int64_t num_tokens) { + __shared__ float cache[256]; + const int tid = threadIdx.x; + int64_t token_idx = blockIdx.x; - // Use int64 to avoid overflowing an int32 when calculating this offset - int64_t offset = static_cast<int64_t>(token_idx) * hidden_size; - scalar_t const* __restrict__ token_input = &input[offset]; - fp8_type* __restrict__ token_output = &out[offset]; - - // For vectorization, token_input and token_output pointers need to be - // aligned at 32-byte and 16-byte addresses respectively. - bool const can_vectorize = hidden_size % 16 == 0; - - float absmax_val = 0.0f; - if (can_vectorize) { - absmax_val = thread_max_vec(token_input, hidden_size, tid, blockDim.x); - } else { - for (int i = tid; i < hidden_size; i += blockDim.x) { - float const x = static_cast<float>(token_input[i]); - absmax_val = fmaxf(absmax_val, fabsf(x)); - } + // one block per token. Guard in case gridDim.x > num_tokens.
+ if (token_idx >= num_tokens) { + return; + } + const scalar_t* row_ptr = input + token_idx * in_row_stride; + + // each thread scans elements of the row in a strided fashion. + float thread_max = 0.0f; + for (int e = tid; e < hidden_size; e += blockDim.x) { + float v = fabsf(static_cast<float>(row_ptr[e])); + thread_max = fmaxf(thread_max, v); + } + + cache[tid] = thread_max; + __syncthreads(); + + // parallel reduction to find row max. + for (int offset = blockDim.x / 2; offset > 0; offset >>= 1) { + if (tid < offset) { + cache[tid] = fmaxf(cache[tid], cache[tid + offset]); + } + __syncthreads(); + } + + // thread 0 updates global scale (per-tensor) atomically. + if (tid == 0) { + atomicMaxFloat(scale, cache[0] / quant_type_max_v<fp8_type>); + } +} + +template <typename scalar_t, typename fp8_type> +__global__ void scaled_fp8_quant_kernel_strided_dynamic( + fp8_type* __restrict__ out, const scalar_t* __restrict__ input, + const float* __restrict__ scale, int hidden_size, int64_t in_row_stride, + int64_t out_row_stride) { + const int64_t token_idx = blockIdx.x; + const int tid = threadIdx.x; + + const scalar_t* token_in = input + token_idx * in_row_stride; + fp8_type* token_out = out + token_idx * out_row_stride; + + const float reciprocal_scale = 1.0f / (*scale); + vectorize_with_alignment<16>( + token_in, token_out, hidden_size, tid, blockDim.x, + [=] __device__(fp8_type & dst, const scalar_t& src) { + dst = scaled_fp8_conversion<true, fp8_type>(static_cast<float>(src), + reciprocal_scale); + }); +} + +template <typename scalar_t, typename fp8_type> +__global__ void dynamic_per_token_scaled_fp8_quant_kernel_strided( + fp8_type* __restrict__ out, float* __restrict__ scale, + const scalar_t* __restrict__ input, const float* __restrict__ scale_ub, + int hidden_size, int64_t in_row_stride, int64_t out_row_stride) { + const int64_t token_idx = blockIdx.x; + const int tid = threadIdx.x; + + // Use int64 to avoid overflowing an int32 when calculating this offset + int64_t in_offset = static_cast<int64_t>(token_idx) * in_row_stride; + int64_t out_offset = static_cast<int64_t>(token_idx) * out_row_stride;
+ const scalar_t* token_in = input + in_offset; + fp8_type* token_out = out + out_offset; + + // 1) per-token absmax + float absmax_val = 0.f; + vectorize_read_with_alignment<16>( + token_in, hidden_size, tid, blockDim.x, [&] __device__(scalar_t v) { + absmax_val = fmaxf(absmax_val, fabsf(static_cast<float>(v))); + }); + using BlockReduce = cub::BlockReduce<float, 256>; - __shared__ typename BlockReduce::TempStorage reduceStorage; - float const block_absmax_val_maybe = - BlockReduce(reduceStorage).Reduce(absmax_val, cub::Max{}, blockDim.x); + __shared__ typename BlockReduce::TempStorage tmp; + const float block_max = + BlockReduce(tmp).Reduce(absmax_val, cub::Max{}, blockDim.x); + __shared__ float token_scale; if (tid == 0) { - if (scale_ub) { - token_scale = fminf(block_absmax_val_maybe, *scale_ub); - } else { - token_scale = block_absmax_val_maybe; - } - // token scale computation + token_scale = scale_ub ? fminf(block_max, *scale_ub) : block_max; token_scale = fmaxf(token_scale / quant_type_max_v<fp8_type>, min_scaling_factor<fp8_type>::val()); scale[token_idx] = token_scale; } __syncthreads(); - // Note that we don't use inverted scales so we can match FBGemm impl.
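For intuition, the per-token scale logic in the kernel above (absmax per row, optional upper bound, clamp to a minimum, then quantize by division rather than by an inverted scale) can be mimicked in a few lines of NumPy. This is only a sketch: `FP8_MAX = 448.0` (the e4m3 maximum) and the `MIN_SCALE` clamp value are assumptions standing in for `quant_type_max_v` and `min_scaling_factor`:

```python
import numpy as np

FP8_MAX = 448.0                       # assumed max for float8_e4m3fn
MIN_SCALE = 1.0 / (FP8_MAX * 512.0)   # assumed stand-in for min_scaling_factor

def dynamic_per_token_scaled_quant(x, scale_ub=None):
    """One scale per token (row): absmax -> optional upper bound -> clamp."""
    absmax = np.abs(x).max(axis=-1)                  # per-token absmax
    if scale_ub is not None:
        absmax = np.minimum(absmax, scale_ub)
    scale = np.maximum(absmax / FP8_MAX, MIN_SCALE)  # never below MIN_SCALE
    # Scales are not inverted here, matching the kernel: quantize by division.
    q = np.clip(x / scale[:, None], -FP8_MAX, FP8_MAX)
    return q, scale

x = np.array([[1.0, -2.0, 4.0],
              [0.0,  0.0, 0.0]])
q, s = dynamic_per_token_scaled_quant(x)
```

The minimum-scale clamp is what keeps an all-zero row (second row above) from producing a divide-by-zero; its quantized values simply come out as zeros.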
- if (can_vectorize) { - scaled_fp8_conversion_vec<scalar_t, false>( - token_output, token_input, token_scale, hidden_size, tid, blockDim.x); - } else { - for (int i = tid; i < hidden_size; i += blockDim.x) { - token_output[i] = scaled_fp8_conversion<false, fp8_type>( - static_cast<float>(token_input[i]), token_scale); - } - } + // 2) quantize + vectorize_with_alignment<16>( + token_in, token_out, hidden_size, tid, blockDim.x, + [=] __device__(fp8_type & dst, const scalar_t& src) { + dst = scaled_fp8_conversion<false, fp8_type>(static_cast<float>(src), + token_scale); + }); } } // namespace vllm @@ -88,23 +142,31 @@ void static_scaled_fp8_quant(torch::Tensor& out, // [..., d] torch::Tensor const& input, // [..., d] torch::Tensor const& scale) // [1] { - TORCH_CHECK(input.is_contiguous()); - TORCH_CHECK(out.is_contiguous()); - int const block_size = 256; - int const num_tokens = input.numel() / input.size(-1); - int const num_elems = input.numel(); - dim3 const grid(num_tokens); - dim3 const block(block_size); + TORCH_CHECK(input.stride(-1) == 1, + "last dimension of input must be contiguous"); + TORCH_CHECK(out.stride(-1) == 1, + "last dimension of output must be contiguous"); + + const int hidden_size = input.size(-1); + const int num_tokens = input.numel() / hidden_size; + const int block_size = 256; + dim3 grid(num_tokens); + dim3 block(block_size); + + const int64_t in_row_stride = input.stride(-2); + const int64_t out_row_stride = out.stride(-2); + const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); VLLM_DISPATCH_FLOATING_TYPES( input.scalar_type(), "scaled_fp8_quant_kernel_scalar_type", [&] { VLLM_DISPATCH_FP8_TYPES( out.scalar_type(), "scaled_fp8_quant_kernel_fp8_type", [&] { - vllm::scaled_fp8_quant_kernel<scalar_t, fp8_t> + vllm::scaled_fp8_quant_kernel_strided<scalar_t, fp8_t> <<<grid, block, 0, stream>>>( out.data_ptr<fp8_t>(), input.data_ptr<scalar_t>(), - scale.data_ptr<float>(), num_elems); + scale.data_ptr<float>(), hidden_size, in_row_stride, + out_row_stride); }); }); } @@ -113,27 +175,42 @@ void
dynamic_scaled_fp8_quant(torch::Tensor& out, // [..., d] torch::Tensor const& input, // [..., d] torch::Tensor& scale) // [1] { - TORCH_CHECK(input.is_contiguous()); - TORCH_CHECK(out.is_contiguous()); - int const block_size = 256; - int const num_tokens = input.numel() / input.size(-1); - int const num_elems = input.numel(); - dim3 const grid(num_tokens); - dim3 const block(block_size); + TORCH_CHECK(input.stride(-1) == 1, + "last dimension of input must be contiguous"); + TORCH_CHECK(out.stride(-1) == 1, + "last dimension of output must be contiguous"); + + const int hidden_size = input.size(-1); + const int num_tokens = input.numel() / hidden_size; + const int block_size = 256; + dim3 grid(num_tokens); + dim3 block(block_size); + + const int64_t in_row_stride = input.stride(-2); + const int64_t out_row_stride = out.stride(-2); + const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + // scale tensor should be initialised to <=0 before reduction + AT_CUDA_CHECK( + cudaMemsetAsync(scale.data_ptr<float>(), 0, sizeof(float), stream)); + VLLM_DISPATCH_FLOATING_TYPES( input.scalar_type(), "scaled_fp8_quant_kernel_scalar_type", [&] { VLLM_DISPATCH_FP8_TYPES( out.scalar_type(), "scaled_fp8_quant_kernel_fp8_type", [&] { - vllm::segmented_max_reduction<scalar_t> - <<<grid, block, 0, stream>>>(scale.data_ptr<float>(), - input.data_ptr<scalar_t>(), - num_elems); - vllm::scaled_fp8_quant_kernel<scalar_t, fp8_t> + vllm::segmented_max_reduction_strided<scalar_t> + <<<grid, block, 0, stream>>>( + scale.data_ptr<float>(), input.data_ptr<scalar_t>(), + hidden_size, in_row_stride, + static_cast<int64_t>(num_tokens)); + + vllm::scaled_fp8_quant_kernel_strided_dynamic<scalar_t, fp8_t> <<<grid, block, 0, stream>>>( out.data_ptr<fp8_t>(), input.data_ptr<scalar_t>(), - scale.data_ptr<float>(), num_elems); + scale.data_ptr<float>(), hidden_size, in_row_stride, + out_row_stride); }); }); } @@ -142,14 +219,19 @@ void dynamic_per_token_scaled_fp8_quant( torch::Tensor& out, // [..., d] torch::Tensor const& input, // [..., d] torch::Tensor& scales, std::optional<torch::Tensor> const& scale_ub) { - TORCH_CHECK(input.is_contiguous()); -
TORCH_CHECK(out.is_contiguous()); + TORCH_CHECK(input.stride(-1) == 1, + "last dimension of input must be contiguous"); + TORCH_CHECK(out.stride(-1) == 1, + "last dimension of output must be contiguous"); - int const hidden_size = input.size(-1); - int const num_tokens = input.numel() / hidden_size; - int const block_size = 256; - dim3 const grid(num_tokens); - dim3 const block(std::min(hidden_size, block_size)); + const int hidden_size = input.size(-1); + const int num_tokens = input.numel() / hidden_size; + const int block_size = 256; + dim3 grid(num_tokens); + dim3 block(std::min(hidden_size, block_size)); + + const int64_t in_row_stride = input.stride(-2); + const int64_t out_row_stride = out.stride(-2); const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); @@ -159,13 +241,12 @@ void dynamic_per_token_scaled_fp8_quant( VLLM_DISPATCH_FP8_TYPES( out.scalar_type(), "dynamic_per_token_scaled_fp8_quant_kernel_fp8_type", [&] { - vllm::dynamic_per_token_scaled_fp8_quant_kernel<scalar_t, fp8_t> - <<<grid, block, 0, stream>>>( - out.data_ptr<fp8_t>(), scales.data_ptr<float>(), - input.data_ptr<scalar_t>(), - scale_ub.has_value() ? scale_ub->data_ptr<float>() - : nullptr, - hidden_size); + vllm::dynamic_per_token_scaled_fp8_quant_kernel_strided< + scalar_t, fp8_t><<<grid, block, 0, stream>>>( + out.data_ptr<fp8_t>(), scales.data_ptr<float>(), + input.data_ptr<scalar_t>(), + scale_ub.has_value() ? scale_ub->data_ptr<float>() : nullptr, + hidden_size, in_row_stride, out_row_stride); }); }); } diff --git a/csrc/quantization/fp8/common.cuh b/csrc/quantization/fp8/common.cuh index d36f94a8f10d6..1aad6330c44b8 100644 --- a/csrc/quantization/fp8/common.cuh +++ b/csrc/quantization/fp8/common.cuh @@ -55,111 +55,4 @@ __device__ __forceinline__ fp8_type scaled_fp8_conversion(float const val, #endif } -// Compute the absolute maximum m of the input tensor and store -// m / float8_e4m3::max() in *scale. Each thread block performs a -// reduction tree and the memory in scale is atomically updated.
-// So to get the right answer, *scale needs to be initialized to -// a value <= 0.0 and we need to wait for all thread blocks to -// finish before consuming *scale. -template <typename scalar_t> -__global__ void segmented_max_reduction(float* __restrict__ scale, - const scalar_t* __restrict__ input, - int64_t num_elems) { - __shared__ float cache[256]; - int64_t i = blockDim.x * blockIdx.x + threadIdx.x; - - // First store maximum for all values processes by - // the current thread in cache[threadIdx.x] - scalar_t tmp = 0.0; - while (i < num_elems) { - float x = static_cast<float>(input[i]); - tmp = fmaxf(tmp, fabsf(x)); - i += blockDim.x * gridDim.x; - } - cache[threadIdx.x] = tmp; - - __syncthreads(); - - // Now perform parallel reduction within the thread block - int ib = blockDim.x / 2; - while (ib != 0) { - if (threadIdx.x < ib && cache[threadIdx.x + ib] > cache[threadIdx.x]) { - cache[threadIdx.x] = cache[threadIdx.x + ib]; - } - __syncthreads(); - ib /= 2; - } - // Finally, since cache[0] contains the maximum for this thread block, - // atomically write the max to the target location - if (threadIdx.x == 0) { - atomicMaxFloat(scale, cache[0] / quant_type_max_v<scalar_t>); - } -} - -template <typename scalar_t> -__device__ float thread_max_vec(scalar_t const* __restrict__ input, - int64_t const num_elems, int const tid, - int const step) { - constexpr size_t VEC_SIZE = 16; - using scalarxN_t = vec_n_t<scalar_t, VEC_SIZE>; - // Vectorized input/output to better utilize memory bandwidth.
- auto const* vectorized_in = reinterpret_cast<scalarxN_t const*>(input); - - // num_elems / VEC_SIZE (which is 16) - int64_t const num_vec_elems = num_elems >> 4; - float absmax_val = 0.0f; - -#pragma unroll - for (int64_t i = tid; i < num_vec_elems; i += step) { - scalarxN_t in_vec = vectorized_in[i]; -#pragma unroll - for (int j = 0; j < VEC_SIZE; ++j) { - absmax_val = fmaxf(absmax_val, fabsf(in_vec.val[j])); - } - } - - // Handle the remaining elements if num_elems is not divisible by VEC_SIZE - for (int64_t i = num_vec_elems * VEC_SIZE + tid; i < num_elems; i += step) { - absmax_val = fmaxf(absmax_val, fabsf(input[i])); - } - - return absmax_val; -} - -template <typename scalar_t, typename fp8_type> -__device__ void scaled_fp8_conversion_vec(fp8_type* __restrict__ out, - scalar_t const* __restrict__ input, - float const scale, - int64_t const num_elems, - int const tid, int const step) { - constexpr size_t VEC_SIZE = 16; - using scalarxN_t = vec_n_t<scalar_t, VEC_SIZE>; - using float8xN_t = q8_n_t<fp8_type, VEC_SIZE>; - // Vectorized input/output to better utilize memory bandwidth.
- auto const* vectorized_in = reinterpret_cast<scalarxN_t const*>(input); - auto* vectorized_out = reinterpret_cast<float8xN_t*>(out); - - // num_elems / VEC_SIZE (which is 16) - int64_t const num_vec_elems = num_elems >> 4; - -#pragma unroll - for (int64_t i = tid; i < num_vec_elems; i += step) { - scalarxN_t in_vec = vectorized_in[i]; - float8xN_t out_vec; - -#pragma unroll - for (int j = 0; j < VEC_SIZE; ++j) { - out_vec.val[j] = scaled_fp8_conversion<false, fp8_type>( - static_cast<float>(in_vec.val[j]), scale); - } - vectorized_out[i] = out_vec; - } - - // Handle the remaining elements if num_elems is not divisible by VEC_SIZE - for (int64_t i = num_vec_elems * VEC_SIZE + tid; i < num_elems; i += step) { - out[i] = scaled_fp8_conversion<false, fp8_type>( - static_cast<float>(input[i]), scale); - } -} - } // namespace vllm diff --git a/csrc/quantization/fp8/per_token_group_quant.cu b/csrc/quantization/fp8/per_token_group_quant.cu index 2609054f2072b..f5b40e35b6e5a 100644 --- a/csrc/quantization/fp8/per_token_group_quant.cu +++ b/csrc/quantization/fp8/per_token_group_quant.cu @@ -1,12 +1,10 @@ #include -#include #include "../per_token_group_quant_8bit.h" #include -#include -#include +#include #include @@ -199,7 +197,7 @@ void per_token_group_quant_8bit(const torch::Tensor& input, VLLM_DISPATCH_FLOATING_TYPES( input.scalar_type(), "per_token_group_quant_8bit", ([&] { if (dst_type == at::ScalarType::Float8_e4m3fn) { - LAUNCH_KERNEL(scalar_t, c10::Float8_e4m3fn); + LAUNCH_KERNEL(scalar_t, __nv_fp8_e4m3); } else if (dst_type == at::ScalarType::Char) { LAUNCH_KERNEL(scalar_t, int8_t); } diff --git a/docker/Dockerfile b/docker/Dockerfile index b87401c593572..d444087a3eff7 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,4 +1,3 @@ - # The vLLM Dockerfile is used to construct vLLM image that can be directly used # to run the OpenAI compatible server.
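The per-token dynamic quantization implemented by the FP8 kernels above can be sketched in plain Python. This is a simplified model of the numerics only: `FP8_E4M3_MAX` is the largest finite e4m3 value, while `MIN_SCALING_FACTOR` is an assumed stand-in for `min_scaling_factor<fp8_type>::val()`, and rounding to actual fp8 values is omitted.

```python
FP8_E4M3_MAX = 448.0  # largest finite float8_e4m3 value
MIN_SCALING_FACTOR = 1.0 / (FP8_E4M3_MAX * 512.0)  # assumed lower bound on the scale

def dynamic_per_token_quant(row, scale_ub=None):
    """Simplified model of the per-token kernel: absmax -> scale -> quantize."""
    absmax = max(abs(v) for v in row)  # 1) per-token absmax
    if scale_ub is not None:           # optional upper bound on the absmax
        absmax = min(absmax, scale_ub)
    scale = max(absmax / FP8_E4M3_MAX, MIN_SCALING_FACTOR)
    # 2) quantize: divide by the (non-inverted) scale, clamp to the fp8 range
    q = [max(-FP8_E4M3_MAX, min(FP8_E4M3_MAX, v / scale)) for v in row]
    return q, scale
```

As in the kernel, the scale is computed per row rather than per tensor, which is why the strided variants only need the row's base pointer and stride.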
@@ -16,6 +15,7 @@ ARG PYTHON_VERSION=3.12 # Example: # docker build --build-arg BUILD_BASE_IMAGE=registry.acme.org/mirror/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 ARG BUILD_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 +# TODO: Restore to base image after FlashInfer AOT wheel fixed ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 # By parameterizing the Deadsnakes repository URL, we allow third-party to use @@ -119,6 +119,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \ # Reference: https://github.com/astral-sh/uv/pull/1694 ENV UV_HTTP_TIMEOUT=500 ENV UV_INDEX_STRATEGY="unsafe-best-match" +# Use copy mode to avoid hardlink failures with Docker cache mounts +ENV UV_LINK_MODE=copy # Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519 # as it was causing spam when compiling the CUTLASS kernels @@ -164,9 +166,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \ # see https://github.com/pytorch/pytorch/pull/123243 ARG torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0 10.0 12.0' ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list} -# Override the arch list for flash-attn to reduce the binary size -ARG vllm_fa_cmake_gpu_arches='80-real;90-real' -ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches} #################### BASE BUILD IMAGE #################### #################### WHEEL BUILD IMAGE #################### @@ -184,6 +183,8 @@ COPY requirements/build.txt requirements/build.txt # Reference: https://github.com/astral-sh/uv/pull/1694 ENV UV_HTTP_TIMEOUT=500 ENV UV_INDEX_STRATEGY="unsafe-best-match" +# Use copy mode to avoid hardlink failures with Docker cache mounts +ENV UV_LINK_MODE=copy RUN --mount=type=cache,target=/root/.cache/uv \ uv pip install --system -r requirements/build.txt \ @@ -275,6 +276,8 @@ ARG PYTORCH_CUDA_INDEX_BASE_URL # Reference: https://github.com/astral-sh/uv/pull/1694 ENV UV_HTTP_TIMEOUT=500 ENV UV_INDEX_STRATEGY="unsafe-best-match" +# Use copy mode to avoid hardlink failures with Docker 
cache mounts +ENV UV_LINK_MODE=copy COPY requirements/lint.txt requirements/lint.txt COPY requirements/test.txt requirements/test.txt @@ -286,7 +289,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \ #################### vLLM installation IMAGE #################### # image with vLLM installed -# TODO: Restore to base image after FlashInfer AOT wheel fixed FROM ${FINAL_BASE_IMAGE} AS vllm-base ARG CUDA_VERSION ARG PYTHON_VERSION @@ -345,6 +347,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \ # Reference: https://github.com/astral-sh/uv/pull/1694 ENV UV_HTTP_TIMEOUT=500 ENV UV_INDEX_STRATEGY="unsafe-best-match" +# Use copy mode to avoid hardlink failures with Docker cache mounts +ENV UV_LINK_MODE=copy # Workaround for https://github.com/openai/triton/issues/2507 and # https://github.com/pytorch/pytorch/issues/107960 -- hopefully @@ -386,7 +390,9 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist # Install FlashInfer from source ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git" -ARG FLASHINFER_GIT_REF="v0.2.9rc2" +# Keep this in sync with https://github.com/vllm-project/vllm/blob/main/requirements/cuda.txt +# We use `--force-reinstall --no-deps` to avoid issues with the existing FlashInfer wheel. +ARG FLASHINFER_GIT_REF="v0.2.9" RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH' . /etc/environment git clone --depth 1 --recursive --shallow-submodules \ @@ -408,7 +414,7 @@ RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH' TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \ python3 -m flashinfer.aot TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \ - uv pip install --system --no-build-isolation . + uv pip install --system --no-build-isolation --force-reinstall --no-deps . 
popd rm -rf flashinfer BASH @@ -429,6 +435,33 @@ RUN --mount=type=cache,target=/root/.cache/uv \ uv pip install --system -r requirements/build.txt \ --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') +# Install DeepGEMM from source +ARG DEEPGEMM_GIT_REPO="https://github.com/deepseek-ai/DeepGEMM.git" +ARG DEEPGEMM_GIT_REF="187656694f7f69e3e7975617a68bc3387680a7e1" +RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH' + . /etc/environment + CUDA_MAJOR="${CUDA_VERSION%%.*}" + CUDA_MINOR="${CUDA_VERSION#${CUDA_MAJOR}.}" + CUDA_MINOR="${CUDA_MINOR%%.*}" + if [ "$CUDA_MAJOR" -ge 12 ] && [ "$CUDA_MINOR" -ge 8 ]; then + git clone --recursive --shallow-submodules \ + ${DEEPGEMM_GIT_REPO} deepgemm + echo "🏗️ Building DeepGEMM" + pushd deepgemm + git checkout ${DEEPGEMM_GIT_REF} + # Build DeepGEMM + # (Based on https://github.com/deepseek-ai/DeepGEMM/blob/main/install.sh) + rm -rf build dist + rm -rf *.egg-info + python3 setup.py bdist_wheel + uv pip install --system dist/*.whl + popd + rm -rf deepgemm + else + echo "Skipping DeepGEMM installation (requires CUDA 12.8+ but got ${CUDA_VERSION})" + fi +BASH + #################### vLLM installation IMAGE #################### #################### TEST IMAGE #################### @@ -447,6 +480,8 @@ ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL # Reference: https://github.com/astral-sh/uv/pull/1694 ENV UV_HTTP_TIMEOUT=500 ENV UV_INDEX_STRATEGY="unsafe-best-match" +# Use copy mode to avoid hardlink failures with Docker cache mounts +ENV UV_LINK_MODE=copy # install development dependencies (for testing) RUN --mount=type=cache,target=/root/.cache/uv \ diff --git a/docker/Dockerfile.nightly_torch b/docker/Dockerfile.nightly_torch index 8d43de77aad59..e147b97f0e056 100644 --- a/docker/Dockerfile.nightly_torch +++ b/docker/Dockerfile.nightly_torch @@ -114,9 +114,6 @@ RUN cat torch_build_versions.txt # explicitly set the list to avoid issues with torch 2.2 # see 
https://github.com/pytorch/pytorch/pull/123243 -# Override the arch list for flash-attn to reduce the binary size -ARG vllm_fa_cmake_gpu_arches='80-real;90-real' -ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches} #################### BASE BUILD IMAGE #################### #################### WHEEL BUILD IMAGE #################### diff --git a/docker/Dockerfile.tpu b/docker/Dockerfile.tpu index b9fc9def88190..2190151369761 100644 --- a/docker/Dockerfile.tpu +++ b/docker/Dockerfile.tpu @@ -1,4 +1,4 @@ -ARG NIGHTLY_DATE="20250724" +ARG NIGHTLY_DATE="20250730" ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.12_tpuvm_$NIGHTLY_DATE" FROM $BASE_IMAGE diff --git a/docs/.nav.yml b/docs/.nav.yml index ab54dc3e535bd..ad742be3d6947 100644 --- a/docs/.nav.yml +++ b/docs/.nav.yml @@ -56,9 +56,7 @@ nav: - contributing/model/tests.md - contributing/model/multimodal.md - CI: contributing/ci - - Design Documents: - - V0: design - - V1: design/v1 + - Design Documents: design - API Reference: - Summary: api/README.md - Contents: diff --git a/docs/assets/design/fused_moe_modular_kernel/fused_experts_blocks.png b/docs/assets/design/fused_moe_modular_kernel/fused_experts_blocks.png new file mode 100644 index 0000000000000..5721d5582c7f1 Binary files /dev/null and b/docs/assets/design/fused_moe_modular_kernel/fused_experts_blocks.png differ diff --git a/docs/assets/design/fused_moe_modular_kernel/fused_moe_batched.png b/docs/assets/design/fused_moe_modular_kernel/fused_moe_batched.png new file mode 100644 index 0000000000000..8168155b9dbaf Binary files /dev/null and b/docs/assets/design/fused_moe_modular_kernel/fused_moe_batched.png differ diff --git a/docs/assets/design/fused_moe_modular_kernel/fused_moe_non_batched.png b/docs/assets/design/fused_moe_modular_kernel/fused_moe_non_batched.png new file mode 100644 index 0000000000000..bc6cc0aaaf47b Binary files /dev/null and b/docs/assets/design/fused_moe_modular_kernel/fused_moe_non_batched.png 
differ diff --git a/docs/assets/design/fused_moe_modular_kernel/prepare_and_finalize_blocks.png b/docs/assets/design/fused_moe_modular_kernel/prepare_and_finalize_blocks.png new file mode 100644 index 0000000000000..94364e593fe68 Binary files /dev/null and b/docs/assets/design/fused_moe_modular_kernel/prepare_and_finalize_blocks.png differ diff --git a/docs/assets/design/v1/metrics/intervals-1.png b/docs/assets/design/metrics/intervals-1.png similarity index 100% rename from docs/assets/design/v1/metrics/intervals-1.png rename to docs/assets/design/metrics/intervals-1.png diff --git a/docs/assets/design/v1/metrics/intervals-2.png b/docs/assets/design/metrics/intervals-2.png similarity index 100% rename from docs/assets/design/v1/metrics/intervals-2.png rename to docs/assets/design/metrics/intervals-2.png diff --git a/docs/assets/design/v1/metrics/intervals-3.png b/docs/assets/design/metrics/intervals-3.png similarity index 100% rename from docs/assets/design/v1/metrics/intervals-3.png rename to docs/assets/design/metrics/intervals-3.png diff --git a/docs/assets/kernel/k_vecs.png b/docs/assets/design/paged_attention/k_vecs.png similarity index 100% rename from docs/assets/kernel/k_vecs.png rename to docs/assets/design/paged_attention/k_vecs.png diff --git a/docs/assets/kernel/key.png b/docs/assets/design/paged_attention/key.png similarity index 100% rename from docs/assets/kernel/key.png rename to docs/assets/design/paged_attention/key.png diff --git a/docs/assets/kernel/logits_vec.png b/docs/assets/design/paged_attention/logits_vec.png similarity index 100% rename from docs/assets/kernel/logits_vec.png rename to docs/assets/design/paged_attention/logits_vec.png diff --git a/docs/assets/kernel/q_vecs.png b/docs/assets/design/paged_attention/q_vecs.png similarity index 100% rename from docs/assets/kernel/q_vecs.png rename to docs/assets/design/paged_attention/q_vecs.png diff --git a/docs/assets/kernel/query.png b/docs/assets/design/paged_attention/query.png 
similarity index 100% rename from docs/assets/kernel/query.png rename to docs/assets/design/paged_attention/query.png diff --git a/docs/assets/kernel/v_vec.png b/docs/assets/design/paged_attention/v_vec.png similarity index 100% rename from docs/assets/kernel/v_vec.png rename to docs/assets/design/paged_attention/v_vec.png diff --git a/docs/assets/kernel/value.png b/docs/assets/design/paged_attention/value.png similarity index 100% rename from docs/assets/kernel/value.png rename to docs/assets/design/paged_attention/value.png diff --git a/docs/assets/design/v1/prefix_caching/example-time-1.png b/docs/assets/design/prefix_caching/example-time-1.png similarity index 100% rename from docs/assets/design/v1/prefix_caching/example-time-1.png rename to docs/assets/design/prefix_caching/example-time-1.png diff --git a/docs/assets/design/v1/prefix_caching/example-time-3.png b/docs/assets/design/prefix_caching/example-time-3.png similarity index 100% rename from docs/assets/design/v1/prefix_caching/example-time-3.png rename to docs/assets/design/prefix_caching/example-time-3.png diff --git a/docs/assets/design/v1/prefix_caching/example-time-4.png b/docs/assets/design/prefix_caching/example-time-4.png similarity index 100% rename from docs/assets/design/v1/prefix_caching/example-time-4.png rename to docs/assets/design/prefix_caching/example-time-4.png diff --git a/docs/assets/design/v1/prefix_caching/example-time-5.png b/docs/assets/design/prefix_caching/example-time-5.png similarity index 100% rename from docs/assets/design/v1/prefix_caching/example-time-5.png rename to docs/assets/design/prefix_caching/example-time-5.png diff --git a/docs/assets/design/v1/prefix_caching/example-time-6.png b/docs/assets/design/prefix_caching/example-time-6.png similarity index 100% rename from docs/assets/design/v1/prefix_caching/example-time-6.png rename to docs/assets/design/prefix_caching/example-time-6.png diff --git a/docs/assets/design/v1/prefix_caching/example-time-7.png 
b/docs/assets/design/prefix_caching/example-time-7.png similarity index 100% rename from docs/assets/design/v1/prefix_caching/example-time-7.png rename to docs/assets/design/prefix_caching/example-time-7.png diff --git a/docs/assets/design/v1/prefix_caching/free.png b/docs/assets/design/prefix_caching/free.png similarity index 100% rename from docs/assets/design/v1/prefix_caching/free.png rename to docs/assets/design/prefix_caching/free.png diff --git a/docs/assets/design/v1/prefix_caching/overview.png b/docs/assets/design/prefix_caching/overview.png similarity index 100% rename from docs/assets/design/v1/prefix_caching/overview.png rename to docs/assets/design/prefix_caching/overview.png diff --git a/docs/assets/design/tpu/most_model_len.png b/docs/assets/design/tpu/most_model_len.png new file mode 100644 index 0000000000000..344a81ed90801 Binary files /dev/null and b/docs/assets/design/tpu/most_model_len.png differ diff --git a/docs/cli/README.md b/docs/cli/README.md index dfb6051a8c8a6..b1371c82a4c4d 100644 --- a/docs/cli/README.md +++ b/docs/cli/README.md @@ -6,13 +6,13 @@ toc_depth: 4 The vllm command-line tool is used to run and manage vLLM models. You can start by viewing the help message with: -``` +```bash vllm --help ``` Available Commands: -``` +```bash vllm {chat,complete,serve,bench,collect-env,run-batch} ``` diff --git a/docs/configuration/tpu.md b/docs/configuration/tpu.md new file mode 100644 index 0000000000000..a2941c80bd27c --- /dev/null +++ b/docs/configuration/tpu.md @@ -0,0 +1,111 @@ +# TPU Optimization Tips + +This doc serves as a collection of handy tips for optimizing your vLLM on TPU workload. + +## Get started + +Looking for setup and installation instructions? Find them [here](../getting_started/installation/google_tpu.md). + +### TPU workload sizing + +When selecting the ideal number of chips for a single serving instance, it's important to account for both the model size and the average request context length. 
Adequate HBM for the KV cache is essential to ensure a sufficient number of concurrent requests can be processed. + +The following Colab [calculator](https://colab.research.google.com/github/ericehanley/rightsize-vllm/blob/main/HBM_Calculator.ipynb) will tell you: + +- KV cache size requirement per token and per request +- TPU/GPU memory consumed by the model weights +- TPU/GPU memory allocated for the KV cache +- Maximum number of requests you can approximately set (`--max-num-seqs`) + +This approach serves as a general rule of thumb. + +#### Latency-throughput tradeoff + +As with rightsizing the number of chips for your workload, consider adjusting `--max-num-seqs` to fine-tune the latency-throughput balance. Decreasing `--max-num-seqs` and/or increasing the number of chips can help reduce latency. + +`--max-num-seqs` defines the number of concurrent decode slots, effectively limiting the number of requests the server can process tokens for simultaneously. Increasing this value allows the server to pre-allocate more HBM to handle a higher number of concurrent requests, which can maximize overall throughput. However, this often increases the end-to-end (e2e) latency per request. + +Therefore, carefully tuning `--max-num-seqs` is crucial to achieving the desired balance between latency and throughput for your specific workload. + +In a similar way, `--max-num-batched-tokens` can be adjusted down to improve latency, or adjusted up to improve throughput. + +#### Compilation and Caching + +Coming from a GPU background, one of the key differences you'll notice with TPUs is an initial compilation step. TPUs are specialized accelerators (ASICs) that achieve maximum performance by executing pre-compiled, static computation graphs via the XLA compiler. Unlike GPUs, which can handle dynamic input shapes more flexibly, TPUs require a specific compiled graph for each tensor shape (e.g., batch size and sequence length) they process.
+ +To manage this, vLLM performs a one-time "warmup" process when you first launch the server. During this phase, it pre-compiles the model for various common input shapes and saves these compiled graphs to a cache on disk or remote storage (located at `~/.cache/vllm/xla_cache` by default). This process can take anywhere from a few minutes to an hour, depending on the size of the model and the context length used. + +Although the first compilation can take some time, for all subsequent server launches, vLLM can load these graphs directly from the cache, eliminating the compilation time for future runs. + +Use the `VLLM_XLA_CACHE_PATH` environment variable to write to shareable storage for future deployed nodes (for example, when using autoscaling). + +#### Reducing compilation time + +This initial compilation time varies significantly and is impacted by many of the arguments discussed in this optimization doc. Compilation time is driven mainly by model size and arguments such as `--max-num-batched-tokens`; environment variables such as `VLLM_TPU_MOST_MODEL_LEN` also have an effect. + +### Optimize based on your data + +#### max model len vs. most model len + +![most_model_len](../assets/design/tpu/most_model_len.png) + +If most of your requests are shorter than the maximum model length but you still need to accommodate occasional longer requests, setting a high maximum model length can negatively impact performance. In these cases, you can try introducing most model len by specifying the `VLLM_TPU_MOST_MODEL_LEN` environment variable. + +For example, if 1% of requests are 32k tokens long and 99% are 2k tokens long, you can pass `--max-model-len 32768` and set `VLLM_TPU_MOST_MODEL_LEN=2048`. + +Requests are then subdivided into max-model-len and most-model-len categories; for the latter, performance improves because the server can process more requests at a time.
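The workload-sizing rule of thumb from the Colab calculator mentioned earlier can be sketched as a quick back-of-envelope calculation. The 8B-class model dimensions in the example are illustrative assumptions, not tied to any specific model:

```python
def kv_cache_plan(num_layers, num_kv_heads, head_dim, dtype_bytes,
                  avg_context_len, hbm_free_bytes):
    """Rule-of-thumb sizing: KV-cache bytes per token and per request, plus
    a rough upper bound for --max-num-seqs given HBM left after weights."""
    # each token stores one key and one value vector per layer per KV head
    bytes_per_token = 2 * num_layers * num_kv_heads * head_dim * dtype_bytes
    bytes_per_request = bytes_per_token * avg_context_len
    max_num_seqs = hbm_free_bytes // bytes_per_request
    return bytes_per_token, bytes_per_request, max_num_seqs

# illustrative 8B-class config: 32 layers, 8 KV heads, head_dim 128, bf16,
# 2k average context, 8 GiB of HBM free for the KV cache
per_tok, per_req, seqs = kv_cache_plan(32, 8, 128, 2, 2048, 8 * 1024**3)
```

Treat the result as a starting point for `--max-num-seqs`, then tune from there as described above.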
+ +#### Padding + +For online serving with latency requirements, consider switching to bucket padding by setting the `VLLM_TPU_BUCKET_PADDING_GAP` environment variable. Because of the layout of the TPU, try using increments of 128: 128, 256, etc. + +The server pads the requests into fixed lengths before sending them to the model to avoid recompilation. To read more about TPU padding, see [here](https://cloud.google.com/tpu/docs/performance-guide#xla-efficiencies). Currently, there are two ways to pad the requests: + +1) the default exponential padding (pad to the nearest power of 2) +2) bucket padding (pad to the nearest linearly increasing bucket). + +When using bucket padding, the buckets start from 16, end at max_model_len, and increment by `VLLM_TPU_BUCKET_PADDING_GAP`. + +For example, max_model_len=512, padding_gap=64, the buckets will be [16, 32, 64, 128, 192, 256, 320, 384, 448, 512]. + +The fewer padding tokens added, the less unnecessary computation the TPU does and the better the performance. For example, with num_tokens=300, exponential padding pads to 512, while the bucket padding above pads to 320. + +However, choose the padding gap carefully. If the gap is too small, the number of buckets is large, which increases warmup (precompile) time and the memory needed to store the compiled graphs; too many compiled graphs may lead to HBM OOM. Conversely, an overly large gap yields no performance improvement compared to the default exponential padding. + +#### Quantization + +If possible, use the precision that matches the chip’s hardware acceleration: + +- v5e has int4/int8 hardware acceleration in the MXU +- v6e has int4/int8 hardware acceleration in the MXU + +Supported quantized formats and features in vLLM on TPU [Jul '25]: + +- INT8 W8A8 +- INT8 W8A16 +- FP8 KV cache +- [WIP] FP8 W8A8 +- [WIP] AWQ +- [WIP] FP4 W4A8 + +#### Parallelization + +Don't set TP to be less than the number of chips on a single-host deployment.
+ +Although it’s common to do this with GPUs, don't try to fragment 2 or 8 different workloads across 8 chips on a single host. If you need 1 or 4 chips, just create an instance with 1 or 4 chips (these are partial-host machine types). + +### Tune your workloads + +Although we try to have great default configs, we strongly recommend you check out the [vLLM auto-tuner](../../benchmarks/auto_tune/README.md) to optimize your workloads for your use case. + +### Future Topics We'll Cover + +#### Profiling + +The auto-tuner provides a profile of optimized configurations as its final step. However, interpreting this profile can be challenging for new users. We plan to expand this section in the future with more detailed guidance. In the meantime, you can learn how to collect a TPU profile using vLLM's native profiling tools [here](../examples/offline_inference/profiling_tpu.md). This profile can provide valuable insights into your workload's performance. + +#### SPMD + +More details to come. + +**Want us to cover something that isn't listed here? Open up an issue please and cite this doc. We'd love to hear your questions or tips.** diff --git a/docs/contributing/README.md b/docs/contributing/README.md index e3ae5055b9988..5a2a70d57e85f 100644 --- a/docs/contributing/README.md +++ b/docs/contributing/README.md @@ -26,6 +26,8 @@ See . ## Developing +--8<-- "docs/getting_started/installation/python_env_setup.inc.md" + Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation. Check out the [building from source][build-from-source] documentation for details. 
@@ -42,7 +44,7 @@ For an optimized workflow when iterating on C++/CUDA kernels, see the [Increment Install MkDocs along with the [plugins](https://github.com/vllm-project/vllm/blob/main/mkdocs.yaml) used in the vLLM documentation, as well as required dependencies: ```bash -pip install -r requirements/docs.txt +uv pip install -r requirements/docs.txt ``` !!! note @@ -98,13 +100,14 @@ For additional features and advanced configurations, refer to the official [MkDo ??? console "Commands" ```bash - pip install -r requirements/common.txt -r requirements/dev.txt + # These commands are only for Nvidia CUDA platforms. + uv pip install -r requirements/common.txt -r requirements/dev.txt --torch-backend=auto # Linting, formatting and static type checking - pre-commit install --hook-type pre-commit --hook-type commit-msg + pre-commit install # You can manually run pre-commit with - pre-commit run --all-files + pre-commit run --all-files --show-diff-on-failure # To manually run something from CI that does not run # locally by default, you can run: @@ -122,6 +125,10 @@ For additional features and advanced configurations, refer to the official [MkDo Therefore, we recommend developing with Python 3.12 to minimise the chance of your local environment clashing with our CI environment. +!!! note "Install python3-dev if Python.h is missing" + If any of the above commands fails with `Python.h: No such file or directory`, install + `python3-dev` with `sudo apt install python3-dev`. + !!! note Currently, the repository is not fully checked by `mypy`. @@ -153,7 +160,7 @@ Using `-s` with `git commit` will automatically add this header. !!! tip You can enable automatic sign-off via your IDE: - + - **PyCharm**: Click on the `Show Commit Options` icon to the right of the `Commit and Push...` button in the `Commit` window. It will bring up a `git` window where you can modify the `Author` and enable `Sign-off commit`. 
- **VSCode**: Open the [Settings editor](https://code.visualstudio.com/docs/configure/settings) diff --git a/docs/contributing/ci/failures.md b/docs/contributing/ci/failures.md index 573efb3b05f6e..d7e2dfbca8760 100644 --- a/docs/contributing/ci/failures.md +++ b/docs/contributing/ci/failures.md @@ -20,19 +20,19 @@ the failure? - **Use this title format:** - ``` + ```text [CI Failure]: failing-test-job - regex/matching/failing:test ``` - **For the environment field:** - ``` - Still failing on main as of commit abcdef123 + ```text + Still failing on main as of commit abcdef123 ``` - **In the description, include failing tests:** - ``` + ```text FAILED failing/test.py:failing_test1 - Failure description FAILED failing/test.py:failing_test2 - Failure description https://github.com/orgs/vllm-project/projects/20 diff --git a/docs/contributing/ci/update_pytorch_version.md b/docs/contributing/ci/update_pytorch_version.md index 5046db11a4715..3a6026d450a67 100644 --- a/docs/contributing/ci/update_pytorch_version.md +++ b/docs/contributing/ci/update_pytorch_version.md @@ -57,8 +57,7 @@ cc the PyTorch release team to initiate discussion on how to address them. ## Update CUDA version -The PyTorch release matrix includes both stable and experimental [CUDA versions](https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-compatibility-matrix). Due to limitations, only the latest stable CUDA version (for example, -`torch2.7.0+cu12.6`) is uploaded to PyPI. However, vLLM may require a different CUDA version, +The PyTorch release matrix includes both stable and experimental [CUDA versions](https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-compatibility-matrix). Due to limitations, only the latest stable CUDA version (for example, torch `2.7.1+cu126`) is uploaded to PyPI. However, vLLM may require a different CUDA version, such as 12.8 for Blackwell support. 
This complicates the process as we cannot use the out-of-the-box `pip install torch torchvision torchaudio` command. The solution is to use
@@ -107,6 +106,7 @@ releases (which would take too much time), they can be built from
 source to unblock the update process.
 
 ### FlashInfer
+
 Here is how to build and install it from source with `torch2.7.0+cu128` in vLLM [Dockerfile](https://github.com/vllm-project/vllm/blob/27bebcd89792d5c4b08af7a65095759526f2f9e1/docker/Dockerfile#L259-L271):
 
 ```bash
@@ -122,6 +122,7 @@ public location for immediate installation, such as [this FlashInfer wheel link]
 team if you want to get the package published there.
 
 ### xFormers
+
 Similar to FlashInfer, here is how to build and install xFormers from source:
 
 ```bash
@@ -139,7 +140,7 @@ uv pip install --system \
 
 ### causal-conv1d
 
-```
+```bash
 uv pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
 ```
 
diff --git a/docs/contributing/deprecation_policy.md b/docs/contributing/deprecation_policy.md
index ff69cbae08b23..904ef4ca058c0 100644
--- a/docs/contributing/deprecation_policy.md
+++ b/docs/contributing/deprecation_policy.md
@@ -31,7 +31,7 @@ Features that fall under this policy include (at a minimum) the following:
 The deprecation process consists of several clearly defined stages that span
 multiple Y releases:
 
-**1. Deprecated (Still On By Default)**
+### 1. Deprecated (Still On By Default)
 
 - **Action**: Feature is marked as deprecated.
 - **Timeline**: A removal version is explicitly stated in the deprecation
@@ -46,7 +46,7 @@ warning (e.g., "This will be removed in v0.10.0").
   - GitHub Issue (RFC) for feedback
   - Documentation and use of the `@typing_extensions.deprecated` decorator
     for Python APIs
 
-**2.Deprecated (Off By Default)**
+### 2. Deprecated (Off By Default)
 
 - **Action**: Feature is disabled by default, but can still be re-enabled via a
   CLI flag or environment variable. Feature throws an error when used without
@@ -55,7 +55,7 @@ re-enabling. 
while signaling imminent removal. Ensures any remaining usage is clearly surfaced and blocks silent breakage before full removal. -**3. Removed** +### 3. Removed - **Action**: Feature is completely removed from the codebase. - **Note**: Only features that have passed through the previous deprecation diff --git a/docs/contributing/profiling.md b/docs/contributing/profiling.md index 13c3bc2c7e031..74627e9062167 100644 --- a/docs/contributing/profiling.md +++ b/docs/contributing/profiling.md @@ -5,7 +5,12 @@ ## Profile with PyTorch Profiler -We support tracing vLLM workers using the `torch.profiler` module. You can enable tracing by setting the `VLLM_TORCH_PROFILER_DIR` environment variable to the directory where you want to save the traces: `VLLM_TORCH_PROFILER_DIR=/mnt/traces/` +We support tracing vLLM workers using the `torch.profiler` module. You can enable tracing by setting the `VLLM_TORCH_PROFILER_DIR` environment variable to the directory where you want to save the traces: `VLLM_TORCH_PROFILER_DIR=/mnt/traces/`. Additionally, you can control the profiling content by specifying the following environment variables: + +- `VLLM_TORCH_PROFILER_RECORD_SHAPES=1` to enable recording Tensor Shapes, off by default +- `VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY=1` to record memory, off by default +- `VLLM_TORCH_PROFILER_WITH_STACK=1` to enable recording stack information, on by default +- `VLLM_TORCH_PROFILER_WITH_FLOPS=1` to enable recording FLOPs, off by default The OpenAI server also needs to be started with the `VLLM_TORCH_PROFILER_DIR` environment variable set. @@ -112,13 +117,13 @@ vllm bench serve \ In practice, you should set the `--duration` argument to a large value. 
Whenever you want the server to stop profiling, run: -``` +```bash nsys sessions list ``` to get the session id in the form of `profile-XXXXX`, then run: -``` +```bash nsys stop --session=profile-XXXXX ``` diff --git a/docs/contributing/vulnerability_management.md b/docs/contributing/vulnerability_management.md index e20b10f8f7b32..847883f742974 100644 --- a/docs/contributing/vulnerability_management.md +++ b/docs/contributing/vulnerability_management.md @@ -32,9 +32,9 @@ We prefer to keep all vulnerability-related communication on the security report on GitHub. However, if you need to contact the VMT directly for an urgent issue, you may contact the following individuals: -- Simon Mo - simon.mo@hey.com -- Russell Bryant - rbryant@redhat.com -- Huzaifa Sidhpurwala - huzaifas@redhat.com +- Simon Mo - +- Russell Bryant - +- Huzaifa Sidhpurwala - ## Slack Discussion diff --git a/docs/deployment/docker.md b/docs/deployment/docker.md index e500751896b34..1f19f2fecfab1 100644 --- a/docs/deployment/docker.md +++ b/docs/deployment/docker.md @@ -10,23 +10,23 @@ The image can be used to run OpenAI compatible server and is available on Docker ```bash docker run --runtime nvidia --gpus all \ -v ~/.cache/huggingface:/root/.cache/huggingface \ - --env "HUGGING_FACE_HUB_TOKEN=" \ + --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \ -p 8000:8000 \ --ipc=host \ vllm/vllm-openai:latest \ - --model mistralai/Mistral-7B-v0.1 + --model Qwen/Qwen3-0.6B ``` This image can also be used with other container engines such as [Podman](https://podman.io/). 
```bash -podman run --gpus all \ +podman run --device nvidia.com/gpu=all \ -v ~/.cache/huggingface:/root/.cache/huggingface \ --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \ -p 8000:8000 \ --ipc=host \ - vllm/vllm-openai:latest \ - --model mistralai/Mistral-7B-v0.1 + docker.io/vllm/vllm-openai:latest \ + --model Qwen/Qwen3-0.6B ``` You can add any other [engine-args](../configuration/engine_args.md) you need after the image tag (`vllm/vllm-openai:latest`). @@ -106,8 +106,7 @@ of PyTorch Nightly and should be considered **experimental**. Using the flag `-- -t vllm/vllm-gh200-openai:latest \ --build-arg max_jobs=66 \ --build-arg nvcc_threads=2 \ - --build-arg torch_cuda_arch_list="9.0 10.0+PTX" \ - --build-arg vllm_fa_cmake_gpu_arches="90-real" + --build-arg torch_cuda_arch_list="9.0 10.0+PTX" ``` !!! note diff --git a/docs/deployment/frameworks/anything-llm.md b/docs/deployment/frameworks/anything-llm.md index d6b28a358cc3d..e62a33b2085ca 100644 --- a/docs/deployment/frameworks/anything-llm.md +++ b/docs/deployment/frameworks/anything-llm.md @@ -19,9 +19,9 @@ vllm serve Qwen/Qwen1.5-32B-Chat-AWQ --max-model-len 4096 - Download and install [Anything LLM desktop](https://anythingllm.com/desktop). 
- On the bottom left of open settings, AI Prooviders --> LLM: - - LLM Provider: Generic OpenAI - - Base URL: http://{vllm server host}:{vllm server port}/v1 - - Chat Model Name: `Qwen/Qwen1.5-32B-Chat-AWQ` + - LLM Provider: Generic OpenAI + - Base URL: http://{vllm server host}:{vllm server port}/v1 + - Chat Model Name: `Qwen/Qwen1.5-32B-Chat-AWQ` ![](../../assets/deployment/anything-llm-provider.png) @@ -30,9 +30,9 @@ vllm serve Qwen/Qwen1.5-32B-Chat-AWQ --max-model-len 4096 ![](../../assets/deployment/anything-llm-chat-without-doc.png) - Click the upload button: - - upload the doc - - select the doc and move to the workspace - - save and embed + - upload the doc + - select the doc and move to the workspace + - save and embed ![](../../assets/deployment/anything-llm-upload-doc.png) diff --git a/docs/deployment/frameworks/chatbox.md b/docs/deployment/frameworks/chatbox.md index 15f92ed1e34df..cbca6e6282fc6 100644 --- a/docs/deployment/frameworks/chatbox.md +++ b/docs/deployment/frameworks/chatbox.md @@ -19,11 +19,11 @@ vllm serve qwen/Qwen1.5-0.5B-Chat - Download and install [Chatbox desktop](https://chatboxai.app/en#download). 
- On the bottom left of settings, Add Custom Provider - - API Mode: `OpenAI API Compatible` - - Name: vllm - - API Host: `http://{vllm server host}:{vllm server port}/v1` - - API Path: `/chat/completions` - - Model: `qwen/Qwen1.5-0.5B-Chat` + - API Mode: `OpenAI API Compatible` + - Name: vllm + - API Host: `http://{vllm server host}:{vllm server port}/v1` + - API Path: `/chat/completions` + - Model: `qwen/Qwen1.5-0.5B-Chat` ![](../../assets/deployment/chatbox-settings.png) diff --git a/docs/deployment/frameworks/dify.md b/docs/deployment/frameworks/dify.md index a3063194fb513..35f02c33cb02b 100644 --- a/docs/deployment/frameworks/dify.md +++ b/docs/deployment/frameworks/dify.md @@ -34,11 +34,11 @@ docker compose up -d - In the top-right user menu (under the profile icon), go to Settings, then click `Model Provider`, and locate the `vLLM` provider to install it. - Fill in the model provider details as follows: - - **Model Type**: `LLM` - - **Model Name**: `Qwen/Qwen1.5-7B-Chat` - - **API Endpoint URL**: `http://{vllm_server_host}:{vllm_server_port}/v1` - - **Model Name for API Endpoint**: `Qwen/Qwen1.5-7B-Chat` - - **Completion Mode**: `Completion` + - **Model Type**: `LLM` + - **Model Name**: `Qwen/Qwen1.5-7B-Chat` + - **API Endpoint URL**: `http://{vllm_server_host}:{vllm_server_port}/v1` + - **Model Name for API Endpoint**: `Qwen/Qwen1.5-7B-Chat` + - **Completion Mode**: `Completion` ![](../../assets/deployment/dify-settings.png) diff --git a/docs/deployment/frameworks/haystack.md b/docs/deployment/frameworks/haystack.md index a18d68142cabb..70b4b48d4543e 100644 --- a/docs/deployment/frameworks/haystack.md +++ b/docs/deployment/frameworks/haystack.md @@ -1,7 +1,5 @@ # Haystack -# Haystack - [Haystack](https://github.com/deepset-ai/haystack) is an end-to-end LLM framework that allows you to build applications powered by LLMs, Transformer models, vector search and more. 
Whether you want to perform retrieval-augmented generation (RAG), document search, question answering or answer generation, Haystack can orchestrate state-of-the-art embedding models and LLMs into pipelines to build end-to-end NLP applications and solve your use case. It allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints. diff --git a/docs/deployment/frameworks/retrieval_augmented_generation.md b/docs/deployment/frameworks/retrieval_augmented_generation.md index 96dd99e7118b6..d5f2ec302b6cd 100644 --- a/docs/deployment/frameworks/retrieval_augmented_generation.md +++ b/docs/deployment/frameworks/retrieval_augmented_generation.md @@ -3,6 +3,7 @@ [Retrieval-augmented generation (RAG)](https://en.wikipedia.org/wiki/Retrieval-augmented_generation) is a technique that enables generative artificial intelligence (Gen AI) models to retrieve and incorporate new information. It modifies interactions with a large language model (LLM) so that the model responds to user queries with reference to a specified set of documents, using this information to supplement information from its pre-existing training data. This allows LLMs to use domain-specific and/or updated information. Use cases include providing chatbot access to internal company data or generating responses based on authoritative sources. Here are the integrations: + - vLLM + [langchain](https://github.com/langchain-ai/langchain) + [milvus](https://github.com/milvus-io/milvus) - vLLM + [llamaindex](https://github.com/run-llama/llama_index) + [milvus](https://github.com/milvus-io/milvus) diff --git a/docs/deployment/integrations/production-stack.md b/docs/deployment/integrations/production-stack.md index 497f9f1a92a5d..fae392589c060 100644 --- a/docs/deployment/integrations/production-stack.md +++ b/docs/deployment/integrations/production-stack.md @@ -140,11 +140,12 @@ The core vLLM production stack configuration is managed with YAML. 
Here is the e ``` In this YAML configuration: + * **`modelSpec`** includes: - * `name`: A nickname that you prefer to call the model. - * `repository`: Docker repository of vLLM. - * `tag`: Docker image tag. - * `modelURL`: The LLM model that you want to use. + * `name`: A nickname that you prefer to call the model. + * `repository`: Docker repository of vLLM. + * `tag`: Docker image tag. + * `modelURL`: The LLM model that you want to use. * **`replicaCount`**: Number of replicas. * **`requestCPU` and `requestMemory`**: Specifies the CPU and memory resource requests for the pod. * **`requestGPU`**: Specifies the number of GPUs required. diff --git a/docs/deployment/k8s.md b/docs/deployment/k8s.md index f244b0858eb6e..cad801a4312cc 100644 --- a/docs/deployment/k8s.md +++ b/docs/deployment/k8s.md @@ -5,7 +5,7 @@ Deploying vLLM on Kubernetes is a scalable and efficient way to serve machine le - [Deployment with CPUs](#deployment-with-cpus) - [Deployment with GPUs](#deployment-with-gpus) - [Troubleshooting](#troubleshooting) - - [Startup Probe or Readiness Probe Failure, container log contains "KeyboardInterrupt: terminated"](#startup-probe-or-readiness-probe-failure-container-log-contains-keyboardinterrupt-terminated) + - [Startup Probe or Readiness Probe Failure, container log contains "KeyboardInterrupt: terminated"](#startup-probe-or-readiness-probe-failure-container-log-contains-keyboardinterrupt-terminated) - [Conclusion](#conclusion) Alternatively, you can deploy vLLM to Kubernetes using any of the following: diff --git a/docs/design/automatic_prefix_caching.md b/docs/design/automatic_prefix_caching.md deleted file mode 100644 index 60e21f6ad0fcb..0000000000000 --- a/docs/design/automatic_prefix_caching.md +++ /dev/null @@ -1,40 +0,0 @@ -# Automatic Prefix Caching - -The core idea of [PagedAttention](https://blog.vllm.ai/2023/06/20/vllm.html) is to partition the KV cache of each request into KV Blocks. 
Each block contains the attention keys and values for a fixed number of tokens. The PagedAttention algorithm allows these blocks to be stored in non-contiguous physical memory so that we can eliminate memory fragmentation by allocating the memory on demand. - -To automatically cache the KV cache, we utilize the following key observation: Each KV block can be uniquely identified by the tokens within the block and the tokens in the prefix before the block. - -```text - Block 1 Block 2 Block 3 - [A gentle breeze stirred] [the leaves as children] [laughed in the distance] -Block 1: |<--- block tokens ---->| -Block 2: |<------- prefix ------>| |<--- block tokens --->| -Block 3: |<------------------ prefix -------------------->| |<--- block tokens ---->| -``` - -In the example above, the KV cache in the first block can be uniquely identified with the tokens “A gentle breeze stirred”. The third block can be uniquely identified with the tokens in the block “laughed in the distance”, along with the prefix tokens “A gentle breeze stirred the leaves as children”. Therefore, we can build the following one-to-one mapping: - -```text -hash(prefix tokens + block tokens) <--> KV Block -``` - -With this mapping, we can add another indirection in vLLM’s KV cache management. Previously, each sequence in vLLM maintained a mapping from their logical KV blocks to physical blocks. To achieve automatic caching of KV blocks, we map the logical KV blocks to their hash value and maintain a global hash table of all the physical blocks. In this way, all the KV blocks sharing the same hash value (e.g., shared prefix blocks across two requests) can be mapped to the same physical block and share the memory space. - -This design achieves automatic prefix caching without the need of maintaining a tree structure among the KV blocks. 
More specifically, all of the blocks are independent of each other and can be allocated and freed by itself, which enables us to manages the KV cache as ordinary caches in operating system. - -## Generalized Caching Policy - -Keeping all the KV blocks in a hash table enables vLLM to cache KV blocks from earlier requests to save memory and accelerate the computation of future requests. For example, if a new request shares the system prompt with the previous request, the KV cache of the shared prompt can directly be used for the new request without recomputation. However, the total KV cache space is limited and we have to decide which KV blocks to keep or evict when the cache is full. - -Managing KV cache with a hash table allows us to implement flexible caching policies. As an example, in current vLLM, we implement the following eviction policy: - -* When there are no free blocks left, we will evict a KV block with reference count (i.e., number of current requests using the block) equals 0. -* If there are multiple blocks with reference count equals to 0, we prioritize to evict the least recently used block (LRU). -* If there are multiple blocks whose last access time are the same, we prioritize the eviction of the block that is at the end of the longest prefix (i.e., has the maximum number of blocks before it). - -Note that this eviction policy effectively implements the exact policy as in [RadixAttention](https://lmsys.org/blog/2024-01-17-sglang/) when applied to models with full attention, which prioritizes to evict reference count zero and least recent used leaf nodes in the prefix tree. - -However, the hash-based KV cache management gives us the flexibility to handle more complicated serving scenarios and implement more complicated eviction policies beyond the policy above: - -* Multi-LoRA serving. 
When serving requests for multiple LoRA adapters, we can simply let the hash of each KV block to also include the LoRA ID the request is querying for to enable caching for all adapters. In this way, we can jointly manage the KV blocks for different adapters, which simplifies the system implementation and improves the global cache hit rate and efficiency. -* Multi-modal models. When the user input includes more than just discrete tokens, we can use different hashing methods to handle the caching of inputs of different modalities. For example, perceptual hashing for images to cache similar input images. diff --git a/docs/design/fused_moe_modular_kernel.md b/docs/design/fused_moe_modular_kernel.md new file mode 100644 index 0000000000000..3ef1232051b07 --- /dev/null +++ b/docs/design/fused_moe_modular_kernel.md @@ -0,0 +1,259 @@ +# Fused MoE Modular Kernel + +## Introduction + +FusedMoEModularKernel is implemented [here](gh-file:/vllm/model_executor/layers/fused_moe/modular_kernel.py) + +Based on the format of the input activations, FusedMoE implementations are broadly classified into 2 types. + +* Contiguous / Standard / Non-Batched, and +* Batched + +!!! note + The terms Contiguous, Standard, and Non-Batched are used interchangeably throughout the document. + +The input activation format completely depends on the All2All Dispatch being used. + +* In the Contiguous variant, the All2All Dispatch returns the activations as a contiguous tensor of shape (M, K) along with TopK Ids and TopK weights of shape (M, num_topk). Look at `DeepEPHTPrepareAndFinalize` for an example. +* In the Batched variant, the All2All Dispatch returns the activations as a tensor of shape (num_experts, max_tokens, K). Here, the activations/tokens that subscribe to the same expert are batched together. Note that not all entries of the tensor are valid. 
The activations tensor is typically accompanied by an `expert_num_tokens` tensor of size `num_experts`, where `expert_num_tokens[i]` indicates the number of valid tokens that subscribe to the ith expert. Look at `PplxPrepareAndFinalize` or `DeepEPLLPrepareAndFinalize` for an example.
+
+The FusedMoE operation is generally made up of multiple operations, in both the Contiguous and Batched variants, as described in the diagrams below.
+
+![](../assets/design/fused_moe_modular_kernel/fused_moe_non_batched.png "FusedMoE Non-Batched")
+
+![](../assets/design/fused_moe_modular_kernel/fused_moe_batched.png "FusedMoE Batched")
+
+!!! note
+    The main difference, in terms of operations, between the Batched and Non-Batched cases is the Permute / Unpermute operations. All other operations remain.
+
+## Motivation
+
+As can be seen from the diagrams, there are many operations, and each can have a variety of implementations. The set of ways the operations can be put together to make a valid FusedMoE implementation quickly becomes intractable. The Modular Kernel framework addresses this issue by grouping the operations into logical components. This broad categorization makes the combinations manageable and prevents code duplication. It also decouples the All2All Dispatch & Combine implementations from the FusedMoE implementations, allowing them to be developed and tested independently. Furthermore, the Modular Kernel framework introduces abstract classes for the different components, thus providing a well-defined skeleton for future implementations.
+
+The rest of the document will focus on the Contiguous / Non-Batched case. Extrapolating to the Batched case should be straightforward.
+
+## ModularKernel Components
+
+FusedMoEModularKernel splits the FusedMoE operation into 3 parts:
+
+1. TopKWeightAndReduce
+2. FusedMoEPrepareAndFinalize
+3. 
FusedMoEPermuteExpertsUnpermute
+
+### TopKWeightAndReduce
+
+The TopK Weight Application and Reduction components happen right after the Unpermute operation and before the All2All Combine. Note that the `FusedMoEPermuteExpertsUnpermute` is responsible for the Unpermute and `FusedMoEPrepareAndFinalize` is responsible for the All2All Combine. There is value in doing the TopK Weight Application and Reduction in the `FusedMoEPermuteExpertsUnpermute`, but some implementations choose to do it in `FusedMoEPrepareAndFinalize`. To enable this flexibility, we have a TopKWeightAndReduce abstract class.
+
+Please find the implementations of TopKWeightAndReduce [here](gh-file:vllm/model_executor/layers/fused_moe/topk_weight_and_reduce.py).
+
+The `FusedMoEPrepareAndFinalize::finalize()` method accepts a `TopKWeightAndReduce` argument that is invoked inside the method.
+The `FusedMoEModularKernel` acts as a bridge between the `FusedMoEPermuteExpertsUnpermute` and `FusedMoEPrepareAndFinalize` implementations to determine where the TopK Weight Application and Reduction happens.
+
+* The `FusedMoEPermuteExpertsUnpermute::finalize_weight_and_reduce_impl` method returns `TopKWeightAndReduceNoOp` if the `FusedMoEPermuteExpertsUnpermute` implementation does the weight application and reduction itself.
+* The `FusedMoEPermuteExpertsUnpermute::finalize_weight_and_reduce_impl` method returns `TopKWeightAndReduceContiguous` / `TopKWeightAndReduceNaiveBatched` / `TopKWeightAndReduceDelegate` if the `FusedMoEPermuteExpertsUnpermute` implementation needs the `FusedMoEPrepareAndFinalize::finalize()` to do the weight application and reduction.
+
+### FusedMoEPrepareAndFinalize
+
+The `FusedMoEPrepareAndFinalize` abstract class exposes `prepare` and `finalize` functions.
+The `prepare` function is responsible for input activation Quantization and All2All Dispatch. The `finalize` function is responsible for invoking the All2All Combine.
Additionally, the `finalize` function may or may not do the TopK weight application and reduction (please refer to the TopKWeightAndReduce section).
+
+![](../assets/design/fused_moe_modular_kernel/prepare_and_finalize_blocks.png "FusedMoEPrepareAndFinalize Blocks")
+
+### FusedMoEPermuteExpertsUnpermute
+
+The `FusedMoEPermuteExpertsUnpermute` class is where the crux of the MoE operations happens. The `FusedMoEPermuteExpertsUnpermute` abstract class exposes a few important functions:
+
+* apply()
+* workspace_shapes()
+* finalize_weight_and_reduce_impl()
+
+#### apply()
+
+The `apply` method is where the implementations perform:
+
+* Permute
+* Matmul with weight W1
+* Act + Mul
+* Quantization
+* Matmul with weight W2
+* Unpermute
+* Maybe TopK Weight Application + Reduction
+
+#### workspace_shapes()
+
+The core FusedMoE implementation performs a series of operations. It would be inefficient to create output memory for each of these operations separately. To that end, implementations are required to declare 2 workspace shapes, the workspace datatype and the FusedMoE output shape as outputs of the workspace_shapes() method. This information is used to allocate the workspace tensors and the output tensor in `FusedMoEModularKernel::forward()` and is passed on to the `FusedMoEPermuteExpertsUnpermute::apply()` method. The workspaces can then be used as intermediate buffers in the FusedMoE implementation.
+
+#### finalize_weight_and_reduce_impl()
+
+It is sometimes efficient to perform TopK weight application and reduction inside `FusedMoEPermuteExpertsUnpermute::apply()`. Find an example [here](https://github.com/vllm-project/vllm/pull/20228). We have a `TopKWeightAndReduce` abstract class to facilitate such implementations. Please refer to the TopKWeightAndReduce section.
+`FusedMoEPermuteExpertsUnpermute::finalize_weight_and_reduce_impl()` returns the `TopKWeightAndReduce` object that the implementation wants the `FusedMoEPrepareAndFinalize::finalize()` to use. 
+
+![](../assets/design/fused_moe_modular_kernel/fused_experts_blocks.png "FusedMoEPermuteExpertsUnpermute Blocks")
+
+### FusedMoEModularKernel
+
+`FusedMoEModularKernel` is composed of the `FusedMoEPrepareAndFinalize` and `FusedMoEPermuteExpertsUnpermute` objects.
+`FusedMoEModularKernel` pseudocode/sketch:
+
+```py
+class FusedMoEModularKernel:
+    def __init__(self,
+                 prepare_finalize: FusedMoEPrepareAndFinalize,
+                 fused_experts: FusedMoEPermuteExpertsUnpermute):
+
+        self.prepare_finalize = prepare_finalize
+        self.fused_experts = fused_experts
+
+    def forward(self, DP_A):
+
+        Aq, A_scale, _, _, _ = self.prepare_finalize.prepare(DP_A, ...)
+
+        workspace13_shape, workspace2_shape, _, _ = self.fused_experts.workspace_shapes(...)
+
+        # allocate workspaces
+        workspace_13 = torch.empty(workspace13_shape, ...)
+        workspace_2 = torch.empty(workspace2_shape, ...)
+
+        # execute fused_experts
+        fe_out = self.fused_experts.apply(Aq, A_scale, workspace_13, workspace_2, ...)
+
+        # war_impl is an object of type TopKWeightAndReduceNoOp if the fused_experts
+        # implementation performs the TopK Weight Application and Reduction itself.
+        war_impl = self.fused_experts.finalize_weight_and_reduce_impl()
+
+        output = self.prepare_finalize.finalize(fe_out, war_impl, ...)
+
+        return output
+```
+
+## How-To
+
+### How To Add a FusedMoEPrepareAndFinalize Type
+
+Typically a FusedMoEPrepareAndFinalize type is backed by an All2All Dispatch & Combine implementation / kernel. For example,
+
+* PplxPrepareAndFinalize type is backed by Pplx All2All kernels,
+* DeepEPHTPrepareAndFinalize type is backed by DeepEP High-Throughput All2All kernels, and
+* DeepEPLLPrepareAndFinalize type is backed by DeepEP Low-Latency All2All kernels.
+
+#### Step 1: Add an All2All manager
+
+The purpose of the All2All Manager is to set up the All2All kernel implementations. 
The `FusedMoEPrepareAndFinalize` implementations typically fetch a kernel-implementation "handle" from the All2All Manager to invoke the Dispatch and Combine functions. Please look at the All2All Manager implementations [here](gh-file:vllm/distributed/device_communicators/all2all.py).
+
+#### Step 2: Add a FusedMoEPrepareAndFinalize Type
+
+This section describes the significance of the various functions exposed by the `FusedMoEPrepareAndFinalize` abstract class.
+
+`FusedMoEPrepareAndFinalize::prepare()`: The prepare method implements the Quantization and All2All Dispatch. Typically the Dispatch function from the relevant All2All Manager is invoked.
+
+`FusedMoEPrepareAndFinalize::finalize()`: Optionally performs the TopK Weight Application and Reduction, then performs the All2All Combine. Typically the Combine function from the relevant All2All Manager is invoked.
+
+`FusedMoEPrepareAndFinalize::activation_format()`: Return `FusedMoEActivationFormat.BatchedExperts` if the output of the prepare method (i.e. the All2All dispatch) is Batched. Return `FusedMoEActivationFormat.Standard` otherwise.
+
+`FusedMoEPrepareAndFinalize::topk_indices_dtype()`: Data type of the TopK ids. Some All2All kernels have strict requirements pertaining to the data type of the TopK ids. This requirement is passed on to the `FusedMoe::select_experts` function so that it can be respected. If there are no strict requirements, return None.
+
+`FusedMoEPrepareAndFinalize::max_num_tokens_per_rank()`: This is the maximum number of tokens that would be submitted to the All2All Dispatch at once.
+
+`FusedMoEPrepareAndFinalize::num_dispatchers()`: Total number of dispatching units. This value determines the size of the Dispatch output. The Dispatch output is of shape (num_local_experts, max_num_tokens, K), where max_num_tokens = num_dispatchers() * max_num_tokens_per_rank(). 
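To make the prepare/finalize contract above concrete, here is a minimal, framework-free sketch. The class and method names are illustrative only (they are not vLLM's actual API), and the "dispatch" and "combine" steps are stand-ins that run in-process rather than across ranks:

```python
# Hypothetical toy illustration of the prepare/finalize contract.
# Real implementations derive from vLLM's FusedMoEPrepareAndFinalize abstract class.
from dataclasses import dataclass


@dataclass
class ToyPrepareAndFinalize:
    num_dispatchers: int
    max_num_tokens_per_rank: int

    def prepare(self, activations):
        # Stand-in for quantization + All2All Dispatch: pass tokens through unchanged.
        return list(activations)

    def finalize(self, expert_outputs, topk_weights):
        # Stand-in for TopK weight application + reduction + All2All Combine:
        # a weighted sum over each token's top-k expert outputs.
        return [sum(w * x for w, x in zip(ws, xs))
                for ws, xs in zip(topk_weights, expert_outputs)]

    def max_num_tokens(self):
        # Size of the batched dispatch buffer, as described above:
        # num_dispatchers * max_num_tokens_per_rank.
        return self.num_dispatchers * self.max_num_tokens_per_rank


pf = ToyPrepareAndFinalize(num_dispatchers=4, max_num_tokens_per_rank=128)
```

The sketch only mirrors the shape of the contract — where quantization, weight application, and reduction slot in — not the communication semantics of a real All2All kernel.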
We suggest picking an already existing `FusedMoEPrepareAndFinalize` implementation that closely matches your All2All implementation and using it as a reference.
+
+### How To Add a FusedMoEPermuteExpertsUnpermute Type
+
+FusedMoEPermuteExpertsUnpermute performs the core of the FusedMoE operations. The various functions exposed by the abstract class and their significance are as follows:
+
+`FusedMoEPermuteExpertsUnpermute::activation_formats()`: Return the supported Input and Output activation formats, i.e. Contiguous / Batched format.
+
+`FusedMoEPermuteExpertsUnpermute::supports_chunking()`: Return True if the implementation supports chunking. Typically
+implementations that input `FusedMoEActivationFormat.Standard` support chunking and `FusedMoEActivationFormat.BatchedExperts` do not.
+
+`FusedMoEPermuteExpertsUnpermute::supports_expert_map()`: Return True if the implementation supports expert maps.
+
+`FusedMoEPermuteExpertsUnpermute::workspace_shapes()` /
+`FusedMoEPermuteExpertsUnpermute::finalize_weight_and_reduce_impl` /
+`FusedMoEPermuteExpertsUnpermute::apply`: Refer to the `FusedMoEPermuteExpertsUnpermute` section above.
+
+### FusedMoEModularKernel Initialization
+
+The `FusedMoEMethodBase` class has 2 methods that are collectively responsible for creating the `FusedMoEModularKernel` object. They are:
+
+* select_gemm_impl, and
+* init_prepare_finalize
+
+#### select_gemm_impl
+
+The `select_gemm_impl` method is undefined in the base class. It is the responsibility of the derived class to implement a method that constructs a valid/appropriate `FusedMoEPermuteExpertsUnpermute` object.
+Please refer to the implementations in the following derived classes:
+
+* `UnquantizedFusedMoEMethod`
+* `CompressedTensorsW8A8Fp8MoEMethod`
+* `CompressedTensorsW8A8Fp8MoECutlassMethod`
+* `Fp8MoEMethod`
+* `ModelOptNvFp4FusedMoE`
+
+#### init_prepare_finalize
+
+Based on the input and env settings, the `init_prepare_finalize` method creates the appropriate `FusedMoEPrepareAndFinalize` object. The method then queries `select_gemm_impl` for the appropriate `FusedMoEPermuteExpertsUnpermute` object and builds the `FusedMoEModularKernel` object.
+
+Please take a look at [init_prepare_finalize](https://github.com/vllm-project/vllm/blob/1cbf951ba272c230823b947631065b826409fa62/vllm/model_executor/layers/fused_moe/layer.py#L188).
+**Important**: The `FusedMoEMethodBase` derived classes use the `FusedMoEMethodBase::fused_experts` object in their `apply` methods. When settings permit the construction of a valid `FusedMoEModularKernel` object, we override `FusedMoEMethodBase::fused_experts` with it. This essentially makes the derived classes agnostic to which FusedMoE implementation is used.
+
+### How To Unit Test
+
+We have `FusedMoEModularKernel` unit tests at [test_modular_kernel_combinations.py](gh-file:tests/kernels/moe/test_modular_kernel_combinations.py).
+
+The unit test iterates through all combinations of `FusedMoEPrepareAndFinalize` and `FusedMoEPermuteExpertsUnpermute` types and, if they are
+compatible, runs some correctness tests.
+If you are adding new `FusedMoEPrepareAndFinalize` / `FusedMoEPermuteExpertsUnpermute` implementations:
+
+1. Add the implementation type to `MK_ALL_PREPARE_FINALIZE_TYPES` and `MK_FUSED_EXPERT_TYPES` in [mk_objects.py](gh-file:tests/kernels/moe/modular_kernel_tools/mk_objects.py) respectively.
+2. Update the `Config::is_batched_prepare_finalize()`, `Config::is_batched_fused_experts()`, `Config::is_standard_fused_experts()`,
+`Config::is_fe_16bit_supported()`, `Config::is_fe_fp8_supported()`, `Config::is_fe_block_fp8_supported()`,
+`Config::is_fe_supports_chunking()` methods in [/tests/kernels/moe/modular_kernel_tools/common.py](gh-file:tests/kernels/moe/modular_kernel_tools/common.py).
+
+Doing this will add the new implementation to the test suite. 
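The compatibility-driven iteration the unit test performs can be pictured with a small, self-contained sketch. The registries below are illustrative stand-ins for the real tables in `mk_objects.py`, and the format strings stand in for `FusedMoEActivationFormat` values:

```python
from itertools import product

# Illustrative registries mapping implementation name -> activation format.
# The real registries live in tests/kernels/moe/modular_kernel_tools/mk_objects.py.
PREPARE_FINALIZE_TYPES = {
    "PplxPrepareAndFinalize": "batched",
    "DeepEPHTPrepareAndFinalize": "standard",
}
FUSED_EXPERT_TYPES = {
    "BatchedTritonExperts": "batched",
    "TritonExperts": "standard",
}


def compatible_pairs():
    # A (prepare_finalize, fused_experts) pair is testable only when both
    # sides agree on the activation format (batched vs. standard).
    return [
        (pf, fe)
        for (pf, pf_fmt), (fe, fe_fmt) in product(
            PREPARE_FINALIZE_TYPES.items(), FUSED_EXPERT_TYPES.items())
        if pf_fmt == fe_fmt
    ]


pairs = compatible_pairs()
```

Incompatible combinations (e.g. a batched dispatcher with a standard-format experts implementation) are simply skipped, which is the same filtering the standalone script surfaces as an error when invoked with an incompatible pair.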
+
+### How To Check `FusedMoEPrepareAndFinalize` & `FusedMoEPermuteExpertsUnpermute` Compatibility
+
+The unit test file [test_modular_kernel_combinations.py](gh-file:tests/kernels/moe/test_modular_kernel_combinations.py) can also be executed as a standalone script.
+Example: `python3 -m tests.kernels.moe.test_modular_kernel_combinations --pf-type PplxPrepareAndFinalize --experts-type BatchedTritonExperts`
+As a side effect, this script can be used to check `FusedMoEPrepareAndFinalize` & `FusedMoEPermuteExpertsUnpermute` compatibility. When invoked
+with incompatible types, the script exits with an error.
+
+### How To Profile
+
+Please take a look at [profile_modular_kernel.py](gh-file:tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py).
+The script can be used to generate Torch traces for a single `FusedMoEModularKernel::forward()` call for any compatible
+`FusedMoEPrepareAndFinalize` and `FusedMoEPermuteExpertsUnpermute` types.
+Example: `python3 -m tests.kernels.moe.modular_kernel_tools.profile_modular_kernel --pf-type PplxPrepareAndFinalize --experts-type BatchedTritonExperts`
+
+## FusedMoEPrepareAndFinalize Implementations
+
+The following table lists the `FusedMoEPrepareAndFinalize` implementations at the time of writing:
+
+| Implementation | Type | Comments |
+| :--- | :--- | :--- |
+| DeepEPHTPrepareAndFinalize | Contiguous / Non-Batched | Uses the DeepEP High-Throughput all2all kernels. |
+| DeepEPLLPrepareAndFinalize | Batched | Uses the DeepEP Low-Latency all2all kernels. |
+| PplxPrepareAndFinalize | Batched | Uses the Perplexity all2all kernels. |
+| FlashInferCutlassMoEPrepareAndFinalize | Contiguous | |
+| MoEPrepareAndFinalizeNoEP | Contiguous | This implementation is used when there is no EP, i.e. no all2all kernels are invoked. |
+| BatchedPrepareAndFinalize | Batched | A reference prepare/finalize class that reorganizes the tokens into expert batched format, i.e. E x max_num_tokens x K. It does not use any all2all kernels and is primarily used in unit testing. |
+
+## FusedMoEPermuteExpertsUnpermute
+
+The following table lists the `FusedMoEPermuteExpertsUnpermute` implementations at the time of writing:
+
+| Implementation | Type | Comments |
+| :--- | :--- | :--- |
+| BatchedDeepGemmExperts | Batched | Uses DeepGemm's Masked Grouped Gemm kernels for the fused_moe operation. |
+| BatchedTritonExperts | Batched | Uses a Triton kernel for the batched matmuls. |
+| BatchedTritonOrDeepGemmExperts | Batched | Chooses either the `BatchedDeepGemmExperts` or `BatchedTritonExperts` based on environment settings. |
+| DeepGemmExperts | Contiguous / Non-Batched | Uses DeepGemm's Grouped Gemm kernels for the fused_moe operation. |
+| TritonExperts | Contiguous / Non-Batched | Uses a Triton kernel for the fused_moe matmuls. |
+| TritonOrDeepGemmExperts | Contiguous / Non-Batched | Chooses either the `DeepGemmExperts` or `TritonExperts` based on fused_moe inputs. |
+| CutlassExpertsFP8 | Supports both Batched and Contiguous formats | Uses Cutlass Grouped Gemm implementations for the fp8 matmuls. |
+| CutlassExpertsFP4 | Supports both Batched and Contiguous formats | Uses Cutlass Grouped Gemm implementations for the fp4 matmuls. |
+| FlashInferExperts | Contiguous | Uses the fused_moe operation from FlashInfer. |
+| NaiveBatchedExperts | Batched | Reference Batched Experts implementation. Primarily used in unit tests. |
diff --git a/docs/design/huggingface_integration.md b/docs/design/huggingface_integration.md
index 7b01313ddb00a..5a7582c86d49f 100644
--- a/docs/design/huggingface_integration.md
+++ b/docs/design/huggingface_integration.md
@@ -1,4 +1,4 @@
-# Integration with HuggingFace
+# Integration with Hugging Face

This document describes how vLLM integrates with HuggingFace libraries. We will explain step by step what happens under the hood when we run `vllm serve`.
diff --git a/docs/design/v1/metrics.md b/docs/design/metrics.md similarity index 99% rename from docs/design/v1/metrics.md rename to docs/design/metrics.md index 52cd320dd4e11..1f65331d3c0a9 100644 --- a/docs/design/v1/metrics.md +++ b/docs/design/metrics.md @@ -223,7 +223,7 @@ And the calculated intervals are: Put another way: -![Interval calculations - common case](../../assets/design/v1/metrics/intervals-1.png) +![Interval calculations - common case](../assets/design/metrics/intervals-1.png) We explored the possibility of having the frontend calculate these intervals using the timing of events visible by the frontend. However, @@ -238,13 +238,13 @@ When a preemption occurs during decode, since any already generated tokens are reused, we consider the preemption as affecting the inter-token, decode, and inference intervals. -![Interval calculations - preempted decode](../../assets/design/v1/metrics/intervals-2.png) +![Interval calculations - preempted decode](../assets/design/metrics/intervals-2.png) When a preemption occurs during prefill (assuming such an event is possible), we consider the preemption as affecting the time-to-first-token and prefill intervals. -![Interval calculations - preempted prefill](../../assets/design/v1/metrics/intervals-3.png) +![Interval calculations - preempted prefill](../assets/design/metrics/intervals-3.png) ### Frontend Stats Collection @@ -361,7 +361,7 @@ instances in Prometheus. 
We use this concept for the `vllm:cache_config_info` metric:

-```
+```text
# HELP vllm:cache_config_info Information of the LLMEngine CacheConfig
# TYPE vllm:cache_config_info gauge
vllm:cache_config_info{block_size="16",cache_dtype="auto",calculate_kv_scales="False",cpu_offload_gb="0",enable_prefix_caching="False",gpu_memory_utilization="0.9",...} 1.0

@@ -686,7 +686,7 @@ documentation for this option states:

The metrics were added by and show up in an OpenTelemetry trace as:

-```
+```text
-> gen_ai.latency.time_in_scheduler: Double(0.017550230026245117)
-> gen_ai.latency.time_in_model_forward: Double(3.151565277099609)
-> gen_ai.latency.time_in_model_execute: Double(3.6468167304992676)
diff --git a/docs/design/v1/multiprocessing.md b/docs/design/multiprocessing.md
similarity index 100%
rename from docs/design/v1/multiprocessing.md
rename to docs/design/multiprocessing.md
diff --git a/docs/design/v1/p2p_nccl_connector.md b/docs/design/p2p_nccl_connector.md
similarity index 94%
rename from docs/design/v1/p2p_nccl_connector.md
rename to docs/design/p2p_nccl_connector.md
index 9d334f8873d97..adf838306bc77 100644
--- a/docs/design/v1/p2p_nccl_connector.md
+++ b/docs/design/p2p_nccl_connector.md
@@ -1,8 +1,11 @@
+# P2P NCCL Connector
+
An implementation of xPyD with dynamic scaling based on point-to-point communication, partly inspired by Dynamo.

-# Detailed Design
+## Detailed Design
+
+### Overall Process

-## Overall Process
As shown in Figure 1, the overall process of this **PD disaggregation** solution is described through a request flow:

1. The client sends an HTTP request to the Proxy/Router's `/v1/completions` interface.
@@ -15,13 +18,13 @@ As shown in Figure 1, the overall process of this **PD disaggregation** solution ![image1](https://github.com/user-attachments/assets/fb01bde6-755b-49f7-ad45-48a94b1e10a7) -## Proxy/Router (Demo) +### Proxy/Router (Demo) A simple HTTP service acts as the entry point for client requests and starts a background thread to listen for P/D instances reporting their HTTP IP and PORT, as well as ZMQ IP and PORT. It maintains a dictionary of `http_addr -> zmq_addr`. The `http_addr` is the IP:PORT for the vLLM instance's request, while the `zmq_addr` is the address for KV cache handshake and metadata reception. The Proxy/Router is responsible for selecting 1P1D based on the characteristics of the client request, such as the prompt, and generating a corresponding `request_id`, for example: -``` +```text cmpl-___prefill_addr_10.0.1.2:21001___decode_addr_10.0.1.3:22001_93923d63113b4b338973f24d19d4bf11-0 ``` @@ -29,13 +32,13 @@ Currently, to quickly verify whether xPyD can work, a round-robin selection of 1 Each P/D instance periodically sends a heartbeat packet to the Proxy/Router (currently every 3 seconds) to register (i.e., report `http_addr -> zmq_addr`) and keep the connection alive. If an instance crashes and fails to send a ping for a certain period of time, the Proxy/Router will remove the timed-out instance (this feature has not yet been developed). -## KV Cache Transfer Methods +### KV Cache Transfer Methods There are three methods for KVCache transfer: PUT, GET, and PUT_ASYNC. These methods can be specified using the `--kv-transfer-config` and `kv_connector_extra_config` parameters, specifically through the `send_type` field. Both PUT and PUT_ASYNC involve the P instance actively sending KVCache to the D instance. The difference is that PUT is a synchronous transfer method that blocks the main process, while PUT_ASYNC is an asynchronous transfer method. 
PUT_ASYNC uses a dedicated thread for sending KVCache, which means it does not block the main process. In contrast, the GET method involves the P instance saving the KVCache to the memory buffer after computing the prefill. The D instance then actively retrieves the computed KVCache from the P instance once it has allocated space for the KVCache. Experimental results have shown that the performance of these methods, from highest to lowest, is as follows: PUT_ASYNC → GET → PUT. -## P2P Communication via ZMQ & NCCL +### P2P Communication via ZMQ & NCCL As long as the address of the counterpart is known, point-to-point KV cache transfer (using NCCL) can be performed, without being constrained by rank and world size. To support dynamic scaling (expansion and contraction) of instances with PD disaggregation. This means that adding or removing P/D instances does not require a full system restart. @@ -43,7 +46,7 @@ Each P/D instance only needs to create a single `P2pNcclEngine` instance. This i When a P instance and a D instance transmit KVCache for the first time, they need to establish a ZMQ connection and an NCCL group. For subsequent KVCache transmissions, this ZMQ connection and NCCL group are reused. The NCCL group consists of only two ranks, meaning the world size is equal to 2. This design is intended to support dynamic scaling, which means that adding or removing P/D instances does not require a full system restart. As long as the address of the counterpart is known, point-to-point KVCache transmission can be performed, without being restricted by rank or world size. -## NCCL Group Topology +### NCCL Group Topology Currently, only symmetric TP (Tensor Parallelism) methods are supported for KVCache transmission. Asymmetric TP and PP (Pipeline Parallelism) methods will be supported in the future. Figure 2 illustrates the 1P2D setup, where each instance has a TP (Tensor Parallelism) degree of 2. 
There are a total of 7 NCCL groups: three vLLM instances each have one NCCL group with TP=2. Additionally, the 0th GPU card of the P instance establishes an NCCL group with the 0th GPU card of each D instance. Similarly, the 1st GPU card of the P instance establishes an NCCL group with the 1st GPU card of each D instance. @@ -51,7 +54,7 @@ Currently, only symmetric TP (Tensor Parallelism) methods are supported for KVCa Each NCCL group occupies a certain amount of GPU memory buffer for communication, the size of which is primarily influenced by the `NCCL_MAX_NCHANNELS` environment variable. When `NCCL_MAX_NCHANNELS=16`, an NCCL group typically occupies 100MB, while when `NCCL_MAX_NCHANNELS=8`, it usually takes up 52MB. For large-scale xPyD configurations—such as DeepSeek's 96P144D—this implementation is currently not feasible. Moving forward, we are considering using RDMA for point-to-point communication and are also keeping an eye on UCCL. -## GPU Memory Buffer and Tensor Memory Pool +### GPU Memory Buffer and Tensor Memory Pool The trade-off in the size of the memory buffer is as follows: For P instances, the memory buffer is not required in PUT and PUT_ASYNC modes, but it is necessary in GET mode. For D instances, a memory buffer is needed in all three modes. The memory buffer for D instances should not be too large. Similarly, for P instances in GET mode, the memory buffer should also not be too large. The memory buffer of D instances is used to temporarily store KVCache sent by P instances. If it is too large, it will reduce the KVCache space available for normal inference by D instances, thereby decreasing the inference batch size and ultimately leading to a reduction in output throughput. The size of the memory buffer is configured by the parameter `kv_buffer_size`, measured in bytes, and is typically set to 5%~10% of the memory size. 
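The sizing guidance above can be turned into a back-of-the-envelope sketch. The helper functions below are illustrative, not part of vLLM; the group-count formula generalizes the 1P2D description (one TP group per instance, plus one two-rank group per P/D GPU pair at the same TP rank):

```python
def nccl_group_count(num_p: int, num_d: int, tp: int) -> int:
    # One TP group per instance, plus one two-rank group per (P GPU, D GPU)
    # pair at the same TP rank.
    return (num_p + num_d) + tp * num_p * num_d


def suggested_kv_buffer_size(gpu_mem_bytes: int, fraction: float = 0.10) -> int:
    # Empirical guidance from the text: 5%~10% of GPU memory, in bytes.
    return int(gpu_mem_bytes * fraction)


# The 1P2D, TP=2 figure above: 3 instance TP groups + 4 cross-instance groups.
assert nccl_group_count(1, 2, tp=2) == 7

# 10% of an 80 GB GPU gives 8e9 bytes, matching the decode-instance examples.
assert suggested_kv_buffer_size(80 * 10**9) == 8 * 10**9
```

The cross-instance term grows with `num_p * num_d`, which is why very large xPyD configurations quickly become infeasible with per-pair NCCL groups.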
@@ -59,15 +62,16 @@ If the `--max-num-seqs` parameter for P instances is set to a large value, due t To address the above issues, I have designed and developed a local Tensor memory pool for storing KVCache, inspired by the buddy system used in Linux memory modules. Since the memory is sufficiently large, typically in the TB range on servers, there is no need to consider prefix caching or using block-based designs to reuse memory, thereby saving space. When the memory buffer is insufficient, KVCache can be directly stored in the Tensor memory pool, and D instances can subsequently retrieve KVCache from it. The read and write speed is that of PCIe, with PCIe 4.0 having a speed of approximately 21 GB/s, which is usually faster than the Prefill speed. Otherwise, solutions like Mooncake and lmcache would not be necessary. The Tensor memory pool acts as a flood diversion area, typically unused except during sudden traffic surges. In the worst-case scenario, my solution performs no worse than the normal situation with a Cache store. -# Install vLLM +## Install vLLM ```shell pip install "vllm>=0.9.2" ``` -# Run xPyD +## Run xPyD + +### Instructions -## Instructions - The following examples are run on an A800 (80GB) device, using the Meta-Llama-3.1-8B-Instruct model. - Pay attention to the setting of the `kv_buffer_size` (in bytes). The empirical value is 10% of the GPU memory size. This is related to the kvcache size. If it is too small, the GPU memory buffer for temporarily storing the received kvcache will overflow, causing the kvcache to be stored in the tensor memory pool, which increases latency. If it is too large, the kvcache available for inference will be reduced, leading to a smaller batch size and decreased throughput. - For Prefill instances, when using non-GET mode, the `kv_buffer_size` can be set to 1, as Prefill currently does not need to receive kvcache. 
However, when using GET mode, a larger `kv_buffer_size` is required because it needs to store the kvcache sent to the D instance. @@ -79,16 +83,16 @@ pip install "vllm>=0.9.2" - Supports multiple nodes; you just need to modify the `proxy_ip` and `proxy_port` in `--kv-transfer-config`. - In the following examples, it is assumed that **the proxy's IP is 10.0.1.1**. -## Run 1P3D +### Run 1P3D -### Proxy (e.g. 10.0.1.1) +#### Proxy (e.g. 10.0.1.1) ```shell cd {your vllm directory}/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/ python3 disagg_proxy_p2p_nccl_xpyd.py & ``` -### Prefill1 (e.g. 10.0.1.2 or 10.0.1.1) +#### Prefill1 (e.g. 10.0.1.2 or 10.0.1.1) ??? console "Command" @@ -105,12 +109,11 @@ python3 disagg_proxy_p2p_nccl_xpyd.py & --max-num-seqs 256 \ --trust-remote-code \ --gpu-memory-utilization 0.9 \ - --disable-log-request \ --kv-transfer-config \ '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"21001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20001"}}' > /var/vllm.log 2>&1 & ``` -### Decode1 (e.g. 10.0.1.3 or 10.0.1.1) +#### Decode1 (e.g. 10.0.1.3 or 10.0.1.1) ??? console "Command" @@ -127,12 +130,11 @@ python3 disagg_proxy_p2p_nccl_xpyd.py & --max-num-seqs 256 \ --trust-remote-code \ --gpu-memory-utilization 0.7 \ - --disable-log-request \ --kv-transfer-config \ '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"22001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20002"}}' > /var/vllm.log 2>&1 & ``` -### Decode2 (e.g. 10.0.1.4 or 10.0.1.1) +#### Decode2 (e.g. 10.0.1.4 or 10.0.1.1) ??? 
console "Command" @@ -149,12 +151,11 @@ python3 disagg_proxy_p2p_nccl_xpyd.py & --max-num-seqs 256 \ --trust-remote-code \ --gpu-memory-utilization 0.7 \ - --disable-log-request \ --kv-transfer-config \ '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"23001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20003"}}' > /var/vllm.log 2>&1 & ``` -### Decode3 (e.g. 10.0.1.5 or 10.0.1.1) +#### Decode3 (e.g. 10.0.1.5 or 10.0.1.1) ??? console "Command" @@ -171,21 +172,20 @@ python3 disagg_proxy_p2p_nccl_xpyd.py & --max-num-seqs 256 \ --trust-remote-code \ --gpu-memory-utilization 0.7 \ - --disable-log-request \ --kv-transfer-config \ '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"24001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20004"}}' > /var/vllm.log 2>&1 & ``` -## Run 3P1D +### Run 3P1D -### Proxy (e.g. 10.0.1.1) +#### Proxy (e.g. 10.0.1.1) ```shell cd {your vllm directory}/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/ python3 disagg_proxy_p2p_nccl_xpyd.py & ``` -### Prefill1 (e.g. 10.0.1.2 or 10.0.1.1) +#### Prefill1 (e.g. 10.0.1.2 or 10.0.1.1) ??? console "Command" @@ -202,12 +202,11 @@ python3 disagg_proxy_p2p_nccl_xpyd.py & --max-num-seqs 256 \ --trust-remote-code \ --gpu-memory-utilization 0.9 \ - --disable-log-request \ --kv-transfer-config \ '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"21001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20001"}}' > /var/vllm.log 2>&1 & ``` -### Prefill2 (e.g. 10.0.1.3 or 10.0.1.1) +#### Prefill2 (e.g. 10.0.1.3 or 10.0.1.1) ??? 
console "Command" @@ -224,12 +223,11 @@ python3 disagg_proxy_p2p_nccl_xpyd.py & --max-num-seqs 256 \ --trust-remote-code \ --gpu-memory-utilization 0.9 \ - --disable-log-request \ --kv-transfer-config \ '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"22001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20002"}}' > /var/vllm.log 2>&1 & ``` -### Prefill3 (e.g. 10.0.1.4 or 10.0.1.1) +#### Prefill3 (e.g. 10.0.1.4 or 10.0.1.1) ??? console "Command" @@ -246,12 +244,11 @@ python3 disagg_proxy_p2p_nccl_xpyd.py & --max-num-seqs 256 \ --trust-remote-code \ --gpu-memory-utilization 0.9 \ - --disable-log-request \ --kv-transfer-config \ '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"23001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20003"}}' > /var/vllm.log 2>&1 & ``` -### Decode1 (e.g. 10.0.1.5 or 10.0.1.1) +#### Decode1 (e.g. 10.0.1.5 or 10.0.1.1) ??? console "Command" @@ -268,12 +265,11 @@ python3 disagg_proxy_p2p_nccl_xpyd.py & --max-num-seqs 256 \ --trust-remote-code \ --gpu-memory-utilization 0.7 \ - --disable-log-request \ --kv-transfer-config \ '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"24001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20004"}}' > /var/vllm.log 2>&1 & ``` -# Single request +## Single request ```shell curl -X POST -s http://10.0.1.1:10001/v1/completions \ @@ -286,7 +282,7 @@ curl -X POST -s http://10.0.1.1:10001/v1/completions \ }' ``` -# Benchmark +## Benchmark ??? 
console "Command" @@ -310,14 +306,14 @@ curl -X POST -s http://10.0.1.1:10001/v1/completions \ --num-prompts 1000 ``` -# Shut down +## Shut down ```shell pgrep python | xargs kill -9 && pkill -f python ``` -# Test data +## Test data -## **Scenario**: 1K input & 200 output tokens, E2E P99 latency ~2s +### **Scenario**: 1K input & 200 output tokens, E2E P99 latency ~2s ![testdata](https://github.com/user-attachments/assets/cef0953b-4567-4bf9-b940-405b92a28eb1) diff --git a/docs/design/kernel/paged_attention.md b/docs/design/paged_attention.md similarity index 94% rename from docs/design/kernel/paged_attention.md rename to docs/design/paged_attention.md index 94bfa97ee2217..fb991a35caf30 100644 --- a/docs/design/kernel/paged_attention.md +++ b/docs/design/paged_attention.md @@ -1,4 +1,8 @@ -# vLLM Paged Attention +# Paged Attention + +!!! warning + This is a historical document based on the [original paper for vLLM](https://arxiv.org/abs/2309.06180). + It no longer describes the code used in vLLM today. Currently, vLLM utilizes its own implementation of a multi-head query attention kernel (`csrc/attention/attention_kernels.cu`). @@ -136,7 +140,7 @@ const scalar_t* q_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE; ```
- ![](../../assets/kernel/query.png){ align="center" alt="query" width="70%" } + ![](../assets/design/paged_attention/query.png){ align="center" alt="query" width="70%" }
Each thread defines its own `q_ptr` which points to the assigned @@ -145,7 +149,7 @@ and `HEAD_SIZE` is 128, the `q_ptr` points to data that contains total of 128 elements divided into 128 / 4 = 32 vecs.
- ![](../../assets/kernel/q_vecs.png){ align="center" alt="q_vecs" width="70%" } + ![](../assets/design/paged_attention/q_vecs.png){ align="center" alt="q_vecs" width="70%" }
```cpp @@ -184,7 +188,7 @@ points to key token data based on `k_cache` at assigned block, assigned head and assigned token.
- ![](../../assets/kernel/key.png){ align="center" alt="key" width="70%" } + ![](../assets/design/paged_attention/key.png){ align="center" alt="key" width="70%" }
The diagram above illustrates the memory layout for key data. It @@ -199,7 +203,7 @@ elements for one token) that will be processed by 2 threads (one thread group) separately.
- ![](../../assets/kernel/k_vecs.png){ align="center" alt="k_vecs" width="70%" } + ![](../assets/design/paged_attention/k_vecs.png){ align="center" alt="k_vecs" width="70%" }
```cpp @@ -358,15 +362,15 @@ later steps. Now, it should store the normalized softmax result of ## Value
- ![](../../assets/kernel/value.png){ align="center" alt="value" width="70%" } + ![](../assets/design/paged_attention/value.png){ align="center" alt="value" width="70%" }
- ![](../../assets/kernel/logits_vec.png){ align="center" alt="logits_vec" width="50%" } + ![](../assets/design/paged_attention/logits_vec.png){ align="center" alt="logits_vec" width="50%" }
- ![](../../assets/kernel/v_vec.png){ align="center" alt="v_vec" width="70%" } + ![](../assets/design/paged_attention/v_vec.png){ align="center" alt="v_vec" width="70%" }
Now we need to retrieve the value data and perform dot multiplication @@ -495,3 +499,14 @@ for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { Finally, we need to iterate over different assigned head positions and write out the corresponding accumulated result based on the `out_ptr`. + +## Citation + +```bibtex +@inproceedings{kwon2023efficient, + title={Efficient Memory Management for Large Language Model Serving with PagedAttention}, + author={Woosuk Kwon and Zhuohan Li and Siyuan Zhuang and Ying Sheng and Lianmin Zheng and Cody Hao Yu and Joseph E. Gonzalez and Hao Zhang and Ion Stoica}, + booktitle={Proceedings of the ACM SIGOPS 29th Symposium on Operating Systems Principles}, + year={2023} +} +``` diff --git a/docs/design/plugin_system.md b/docs/design/plugin_system.md index 23a05ac719ce2..ca1c2c2305d91 100644 --- a/docs/design/plugin_system.md +++ b/docs/design/plugin_system.md @@ -1,4 +1,4 @@ -# vLLM's Plugin System +# Plugin System The community frequently requests the ability to extend vLLM with custom features. To facilitate this, vLLM includes a plugin system that allows users to add custom features without modifying the vLLM codebase. This document explains how plugins work in vLLM and how to create a plugin for vLLM. diff --git a/docs/design/v1/prefix_caching.md b/docs/design/prefix_caching.md similarity index 90% rename from docs/design/v1/prefix_caching.md rename to docs/design/prefix_caching.md index 2d3c8412894a6..9941837bf1652 100644 --- a/docs/design/v1/prefix_caching.md +++ b/docs/design/prefix_caching.md @@ -18,10 +18,12 @@ In the example above, the KV cache in the first block can be uniquely identified * Block tokens: A tuple of tokens in this block. The reason to include the exact tokens is to reduce potential hash value collision. * Extra hashes: Other values required to make this block unique, such as LoRA IDs, multi-modality input hashes (see the example below), and cache salts to isolate caches in multi-tenant environments. 
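The chained hash key described above can be sketched as a small, self-contained illustration. This mirrors the idea (a hash over parent hash, block tokens, and extra keys), not vLLM's actual implementation:

```python
import hashlib
import pickle


def block_hash(parent_hash, block_tokens, extra=None):
    # The hash of a block covers its parent's hash (and therefore all
    # prefix tokens), the exact tokens in this block, and any extra keys
    # such as a LoRA ID, multi-modal input hash, or cache salt.
    payload = pickle.dumps((parent_hash, tuple(block_tokens), extra))
    return hashlib.sha256(payload).digest()


h0 = block_hash(None, (1, 2, 3, 4))                        # first full block
h1 = block_hash(h0, (5, 6, 7, 8))                          # chained to h0
salted = block_hash(None, (1, 2, 3, 4), extra="tenant-a")  # isolated cache

assert h1 != block_hash(None, (5, 6, 7, 8))  # same tokens, different prefix
assert salted != h0                          # salt isolates tenants
```

Because the parent hash is folded in, a single block hash uniquely identifies (up to collisions) all tokens up to and including that block.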
-> **Note 1:** We only cache full blocks.
+!!! note "Note 1"
+    We only cache full blocks.

-> **Note 2:** The above hash key structure is not 100% collision free. Theoretically it’s still possible for the different prefix tokens to have the same hash value. To avoid any hash collisions **in a multi-tenant setup, we advise to use SHA256** as hash function instead of the default builtin hash.
-SHA256 is supported since vLLM v0.8.3 and must be enabled with a command line argument. It comes with a performance impact of about 100-200ns per token (~6ms for 50k tokens of context).
+!!! note "Note 2"
+    The above hash key structure is not 100% collision free. Theoretically it’s still possible for different prefix tokens to have the same hash value. To avoid hash collisions **in a multi-tenant setup, we advise using SHA256** as the hash function instead of the default builtin hash.
+    SHA256 is supported since vLLM v0.8.3 and must be enabled with a command line argument. It comes with a performance impact of about 100-200ns per token (~6ms for 50k tokens of context).

**A hashing example with multi-modality inputs**

In this example, we illustrate how prefix caching works with multi-modality inputs (e.g., images). Assuming we have a request with the following messages:

@@ -92,7 +94,8 @@ To improve privacy in shared environments, vLLM supports isolating prefix cache

With this setup, cache sharing is limited to users or requests that explicitly agree on a common salt, enabling cache reuse within a trust group while isolating others.

-> **Note:** Cache isolation is not supported in engine V0.
+!!! note
+    Cache isolation is not supported in engine V0.
## Data Structure

@@ -122,7 +125,7 @@ There are two design points to highlight:

As a result, we will have the following components when the KV cache manager is initialized:

-![Component Overview](../../assets/design/v1/prefix_caching/overview.png)
+![Component Overview](../assets/design/prefix_caching/overview.png)

* Block Pool: A list of KVCacheBlock.
* Free Block Queue: Only store the pointers of head and tail blocks for manipulations.

@@ -192,7 +195,7 @@ As can be seen, block 3 is a new full block and is cached. However, it is redund

When a request is finished, we free all its blocks if no other requests are using them (reference count = 0). In this example, we free request 1 and block 2, 3, 4, 8 associated with it. We can see that the freed blocks are added to the tail of the free queue in the *reverse* order. This is because the last block of a request must hash more tokens and is less likely to be reused by other requests. As a result, it should be evicted first.

-![Free queue after a request us freed](../../assets/design/v1/prefix_caching/free.png)
+![Free queue after a request is freed](../assets/design/prefix_caching/free.png)

### Eviction (LRU)

@@ -208,24 +211,24 @@ In this example, we assume the block size is 4 (each block can cache 4 tokens),

**Time 1: The cache is empty and a new request comes in.** We allocate 4 blocks. 3 of them are already full and cached. The fourth block is partially full with 3 of 4 tokens.

-![Example Time 1](../../assets/design/v1/prefix_caching/example-time-1.png)
+![Example Time 1](../assets/design/prefix_caching/example-time-1.png)

**Time 3: Request 0 makes the block 3 full and asks for a new block to keep decoding.** We cache block 3 and allocate block 4.
-![Example Time 3](../../assets/design/v1/prefix_caching/example-time-3.png) +![Example Time 3](../assets/design/prefix_caching/example-time-3.png) **Time 4: Request 1 comes in with the 14 prompt tokens, where the first 10 tokens are the same as request 0.** We can see that only the first 2 blocks (8 tokens) hit the cache, because the 3rd block only matches 2 of 4 tokens. -![Example Time 4](../../assets/design/v1/prefix_caching/example-time-4.png) +![Example Time 4](../assets/design/prefix_caching/example-time-4.png) **Time 5: Request 0 is finished and free.** Blocks 2, 3 and 4 are added to the free queue in the reverse order (but block 2 and 3 are still cached). Block 0 and 1 are not added to the free queue because they are being used by Request 1. -![Example Time 5](../../assets/design/v1/prefix_caching/example-time-5.png) +![Example Time 5](../assets/design/prefix_caching/example-time-5.png) **Time 6: Request 1 is finished and free.** -![Example Time 6](../../assets/design/v1/prefix_caching/example-time-6.png) +![Example Time 6](../assets/design/prefix_caching/example-time-6.png) **Time 7: Request 2 comes in with the 29 prompt tokens, where the first 12 tokens are the same as request 0\.** Note that even the block order in the free queue was `7 - 8 - 9 - 4 - 3 - 2 - 6 - 5 - 1 - 0`, the cache hit blocks (i.e., 0, 1, 2) are touched and removed from the queue before allocation, so the free queue becomes `7 - 8 - 9 - 4 - 3 - 6 - 5`. As a result, the allocated blocks are 0 (cached), 1 (cached), 2 (cached), 7, 8, 9, 4, 3 (evicted). 
-![Example Time 7](../../assets/design/v1/prefix_caching/example-time-7.png) +![Example Time 7](../assets/design/prefix_caching/example-time-7.png) diff --git a/docs/design/v1/torch_compile.md b/docs/design/torch_compile.md similarity index 99% rename from docs/design/v1/torch_compile.md rename to docs/design/torch_compile.md index ea5d8ac212f7a..47ac4958dbf7f 100644 --- a/docs/design/v1/torch_compile.md +++ b/docs/design/torch_compile.md @@ -1,4 +1,4 @@ -# vLLM's `torch.compile` integration +# `torch.compile` integration In vLLM's V1 architecture, `torch.compile` is enabled by default and is a critical part of the framework. This document gives a simple walk-through example to show how to understand the `torch.compile` usage. @@ -8,7 +8,7 @@ Throughout the example, we will run a common Llama model using v1, and turn on d In the very verbose logs, we can see: -``` +```console INFO 03-07 03:06:55 [backends.py:409] Using cache directory: ~/.cache/vllm/torch_compile_cache/1517964802/rank_0_0 for vLLM's torch.compile ``` @@ -75,7 +75,7 @@ Every submodule can be identified by its index, and will be processed individual In the very verbose logs, we can also see: -``` +```console DEBUG 03-07 03:52:37 [backends.py:134] store the 0-th graph for shape None from inductor via handle ('fpegyiq3v3wzjzphd45wkflpabggdbjpylgr7tta4hj6uplstsiw', '~/.cache/vllm/torch_compile_cache/1517964802/rank_0_0/inductor_cache/iw/ciwzrk3ittdqatuzwonnajywvno3llvjcs2vfdldzwzozn3zi3iy.py') DEBUG 03-07 03:52:39 [backends.py:134] store the 1-th graph for shape None from inductor via handle ('f7fmlodmf3h3by5iiu2c4zarwoxbg4eytwr3ujdd2jphl4pospfd', '~/.cache/vllm/torch_compile_cache/1517964802/rank_0_0/inductor_cache/ly/clyfzxldfsj7ehaluis2mca2omqka4r7mgcedlf6xfjh645nw6k2.py') ... @@ -93,7 +93,7 @@ One more detail: you can see that the 1-th graph and the 15-th graph have the sa If we already have the cache directory (e.g. 
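The Time 7 bookkeeping above can be reproduced with a short sketch that models the free queue as an ordered set. This illustrates the touch-and-evict order only, not vLLM's actual data structures:

```python
from collections import OrderedDict

# Free queue after Time 6: 7 - 8 - 9 - 4 - 3 - 2 - 6 - 5 - 1 - 0
# (front of the queue is evicted first).
free_queue = OrderedDict.fromkeys([7, 8, 9, 4, 3, 2, 6, 5, 1, 0])

# Cache-hit blocks (0, 1, 2) are touched: removed from the free queue
# before allocation.
for blk in (0, 1, 2):
    free_queue.pop(blk)
assert list(free_queue) == [7, 8, 9, 4, 3, 6, 5]

# Request 2 needs 8 blocks (29 tokens / block size 4): 3 cached hits plus
# 5 blocks popped (evicted) from the front of the free queue.
allocated = [0, 1, 2] + [free_queue.popitem(last=False)[0] for _ in range(5)]
assert allocated == [0, 1, 2, 7, 8, 9, 4, 3]
```

The assertions match the allocation order stated in the text: 0 (cached), 1 (cached), 2 (cached), 7, 8, 9, 4, 3 (evicted).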
run the same code for the second time), we will see the following logs: -``` +```console DEBUG 03-07 04:00:45 [backends.py:86] Directly load the 0-th graph for shape None from inductor via handle ('fpegyiq3v3wzjzphd45wkflpabggdbjpylgr7tta4hj6uplstsiw', '~/.cache/vllm/torch_compile_cache/1517964802/rank_0_0/inductor_cache/iw/ciwzrk3ittdqatuzwonnajywvno3llvjcs2vfdldzwzozn3zi3iy.py') ``` diff --git a/docs/features/compatibility_matrix.md b/docs/features/compatibility_matrix.md index 8be1585f8e76b..5b08b3810776c 100644 --- a/docs/features/compatibility_matrix.md +++ b/docs/features/compatibility_matrix.md @@ -34,23 +34,26 @@ th:not(:first-child) { } -| Feature | [CP][chunked-prefill] | [APC](automatic_prefix_caching.md) | [LoRA](lora.md) | [SD](spec_decode.md) | CUDA graph | pooling | enc-dec | logP | prmpt logP | async output | multi-step | mm | best-of | beam-search | +| Feature | [CP][chunked-prefill] | [APC](automatic_prefix_caching.md) | [LoRA](lora.md) | [SD](spec_decode.md) | CUDA graph | [pooling](../models/pooling_models.md) | enc-dec | logP | prmpt logP | async output | multi-step | mm | best-of | beam-search | |---|---|---|---|---|---|---|---|---|---|---|---|---|---|---| -| [CP][chunked-prefill] | ✅ | | | | | | | | | | | | | | | -| [APC](automatic_prefix_caching.md) | ✅ | ✅ | | | | | | | | | | | | | | -| [LoRA](lora.md) | ✅ | ✅ | ✅ | | | | | | | | | | | | | +| [CP][chunked-prefill] | ✅ | | | | | | | | | | | | | | +| [APC](automatic_prefix_caching.md) | ✅ | ✅ | | | | | | | | | | | | | +| [LoRA](lora.md) | ✅ | ✅ | ✅ | | | | | | | | | | | | | [SD](spec_decode.md) | ✅ | ✅ | ❌ | ✅ | | | | | | | | | | | | CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | -| pooling | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | | | | | | | | | +| [pooling](../models/pooling_models.md) | 🟠\* | 🟠\* | ✅ | ❌ | ✅ | ✅ | | | | | | | | | | enc-dec | ❌ | [❌](gh-issue:7366) | ❌ | [❌](gh-issue:7366) | ✅ | ✅ | ✅ | | | | | | | | | logP | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | | | | | | | | prmpt logP | ✅ | ✅ | ✅ | ✅ | ✅ 
| ❌ | ✅ | ✅ | ✅ | | | | | | | async output | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | | | | | | multi-step | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | | | | -| mm | ✅ | [🟠](gh-pr:8348) | [🟠](gh-pr:4194) | ❔ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | | | +| [mm](multimodal_inputs.md) | ✅ | ✅ | [🟠](gh-pr:4194)^ | ❔ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | | | | best-of | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ✅ | ✅ | | | beam-search | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ❔ | ✅ | ✅ | +\* Chunked prefill and prefix caching are only applicable to last-token pooling. +^ LoRA is only applicable to the language backbone of multimodal models. + [](){ #feature-x-hardware } ## Feature x Hardware @@ -62,9 +65,9 @@ th:not(:first-child) { | [LoRA](lora.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | [SD](spec_decode.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | | CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | -| pooling | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ❌ | +| [pooling](../models/pooling_models.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | | enc-dec | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | -| mm | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | +| [mm](multimodal_inputs.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | | logP | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | | prmpt logP | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | | async output | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | diff --git a/docs/features/lora.md b/docs/features/lora.md index ea1b495138c1b..a4e05dae11c2e 100644 --- a/docs/features/lora.md +++ b/docs/features/lora.md @@ -119,6 +119,7 @@ export VLLM_ALLOW_RUNTIME_LORA_UPDATING=True ``` ### Using API Endpoints + Loading a LoRA Adapter: To dynamically load a LoRA adapter, send a POST request to the `/v1/load_lora_adapter` endpoint with the necessary @@ -156,6 +157,7 @@ curl -X POST http://localhost:8000/v1/unload_lora_adapter \ ``` ### Using Plugins + Alternatively, you can use the LoRAResolver plugin to dynamically load LoRA adapters. 
LoRAResolver plugins enable you to load LoRA adapters from both local and remote sources such as local file system and S3. On every request, when there's a new model name that hasn't been loaded yet, the LoRAResolver will try to resolve and load the corresponding LoRA adapter.

You can set up multiple LoRAResolver plugins if you want to load LoRA adapters from different sources. For example, you might have one resolver for local files and another for S3 storage. vLLM will load the first LoRA adapter that it finds.

diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md
index d4c8852206bba..cdd32924b5668 100644
--- a/docs/features/multimodal_inputs.md
+++ b/docs/features/multimodal_inputs.md
@@ -172,6 +172,36 @@ Multi-image input can be extended to perform video captioning. We show this with
     print(generated_text)
 ```
 
+#### Custom RGBA Background Color
+
+When loading RGBA images (images with transparency), vLLM converts them to RGB format. By default, transparent pixels are replaced with a white background. You can customize this background color using the `rgba_background_color` parameter in `media_io_kwargs`.
+
+??? code
+
+    ```python
+    from vllm import LLM
+
+    # Default white background (no configuration needed)
+    llm = LLM(model="llava-hf/llava-1.5-7b-hf")
+
+    # Custom black background for dark theme
+    llm = LLM(
+        model="llava-hf/llava-1.5-7b-hf",
+        media_io_kwargs={"image": {"rgba_background_color": [0, 0, 0]}}
+    )
+
+    # Custom brand color background (e.g., blue)
+    llm = LLM(
+        model="llava-hf/llava-1.5-7b-hf",
+        media_io_kwargs={"image": {"rgba_background_color": [0, 0, 255]}}
+    )
+    ```
+
+!!! 
note + - The `rgba_background_color` accepts RGB values as a list `[R, G, B]` or tuple `(R, G, B)` where each value is 0-255 + - This setting only affects RGBA images with transparency; RGB images are unchanged + - If not specified, the default white background `(255, 255, 255)` is used for backward compatibility + ### Video Inputs You can pass a list of NumPy arrays directly to the `'video'` field of the multi-modal dictionary @@ -478,6 +508,20 @@ Full example: ``` +#### Custom RGBA Background Color + +To use a custom background color for RGBA images, pass the `rgba_background_color` parameter via `--media-io-kwargs`: + +```bash +# Example: Black background for dark theme +vllm serve llava-hf/llava-1.5-7b-hf \ + --media-io-kwargs '{"image": {"rgba_background_color": [0, 0, 0]}}' + +# Example: Custom gray background +vllm serve llava-hf/llava-1.5-7b-hf \ + --media-io-kwargs '{"image": {"rgba_background_color": [128, 128, 128]}}' +``` + ### Audio Inputs Audio input is supported according to [OpenAI Audio API](https://platform.openai.com/docs/guides/audio?audio-generation-quickstart-example=audio-in). 
@@ -588,7 +632,9 @@ Full example: +th { + white-space: nowrap; + min-width: 0 !important; +} + + | Implementation | Volta | Turing | Ampere | Ada | Hopper | AMD GPU | Intel GPU | Intel Gaudi | x86 CPU | AWS Neuron | Google TPU | |-----------------------|---------|----------|----------|-------|----------|-----------|-------------|-------------|-----------|--------------|--------------| -| AWQ | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ❌ | ✅︎ | ❌ | ❌ | -| GPTQ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ❌ | ✅︎ | ❌ | ❌ | -| Marlin (GPTQ/AWQ/FP8) | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | -| INT8 (W8A8) | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | -| FP8 (W8A8) | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ✅︎ | ❌ | +| AWQ | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ❌ | ✅︎ | ❌ | ❌ | +| GPTQ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ❌ | ✅︎ | ❌ | ❌ | +| Marlin (GPTQ/AWQ/FP8) | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | +| INT8 (W8A8) | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | +| FP8 (W8A8) | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ✅︎ | ❌ | | BitBLAS (GPTQ) | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | | AQLM | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | | bitsandbytes | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | | DeepSpeedFP | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | -| GGUF | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | -| INC (W8A8) | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅︎ | ❌ | ❌ | ❌ | +| GGUF | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | +| INC (W8A8) | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅︎ | ❌ | ❌ | ❌ | - Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0. - ✅︎ indicates that the quantization method is supported on the specified hardware. 
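As background for the `INT8 (W8A8)` row above, here is a minimal, self-contained sketch of symmetric per-tensor INT8 quantization, the general scheme that entry refers to. This is plain Python for illustration only, not vLLM code; `quantize_int8` and `dequantize` are hypothetical helpers.

```python
# Minimal sketch of symmetric per-tensor INT8 quantization, the general
# scheme the "INT8 (W8A8)" row above refers to. Illustration only; this
# is not vLLM's implementation.

def quantize_int8(values):
    """Map floats onto [-127, 127] using a single symmetric scale."""
    scale = max(abs(v) for v in values) / 127.0 or 1.0  # avoid a zero scale
    q = [max(-127, min(127, round(v / scale))) for v in values]
    return q, scale

def dequantize(q, scale):
    """Recover approximate floats from the int8 codes and the shared scale."""
    return [x * scale for x in q]

weights = [0.5, -1.27, 0.0, 0.64]
q, scale = quantize_int8(weights)   # q == [50, -127, 0, 64]
restored = dequantize(q, scale)
# The round-trip error is bounded by half a quantization step.
assert all(abs(a - b) <= scale / 2 for a, b in zip(weights, restored))
```

In a W8A8 kernel both the weights and the activations are stored this way, which is why hardware support for int8 matrix instructions (see the table) matters.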
diff --git a/docs/features/quantization/torchao.md b/docs/features/quantization/torchao.md index ab6802177048b..6932445997012 100644 --- a/docs/features/quantization/torchao.md +++ b/docs/features/quantization/torchao.md @@ -13,6 +13,7 @@ pip install \ ``` ## Quantizing HuggingFace Models + You can quantize your own huggingface model with torchao, e.g. [transformers](https://huggingface.co/docs/transformers/main/en/quantization/torchao) and [diffusers](https://huggingface.co/docs/diffusers/en/quantization/torchao), and save the checkpoint to huggingface hub like [this](https://huggingface.co/jerryzh168/llama3-8b-int8wo) with the following example code: ??? code diff --git a/docs/features/reasoning_outputs.md b/docs/features/reasoning_outputs.md index 6b84eca275309..04b943efbbbb4 100644 --- a/docs/features/reasoning_outputs.md +++ b/docs/features/reasoning_outputs.md @@ -123,13 +123,12 @@ OpenAI Python client library does not officially support `reasoning_content` att printed_content = False for chunk in stream: - reasoning_content = None - content = None - # Check the content is reasoning_content or content - if hasattr(chunk.choices[0].delta, "reasoning_content"): - reasoning_content = chunk.choices[0].delta.reasoning_content - elif hasattr(chunk.choices[0].delta, "content"): - content = chunk.choices[0].delta.content + # Safely extract reasoning_content and content from delta, + # defaulting to None if attributes don't exist or are empty strings + reasoning_content = ( + getattr(chunk.choices[0].delta, "reasoning_content", None) or None + ) + content = getattr(chunk.choices[0].delta, "content", None) or None if reasoning_content is not None: if not printed_reasoning_content: diff --git a/docs/features/spec_decode.md b/docs/features/spec_decode.md index be4b91feda7aa..89d5b489e1888 100644 --- a/docs/features/spec_decode.md +++ b/docs/features/spec_decode.md @@ -15,6 +15,10 @@ Speculative decoding is a technique which improves inter-token latency in memory The 
following code configures vLLM in an offline mode to use speculative decoding with a draft model, speculating 5 tokens at a time.

+!!! warning
+    In vLLM v0.10.0, speculative decoding with a draft model is not supported.
+    If you use the following code, you will get a `NotImplementedError`.
+
 ??? code
 
     ```python
diff --git a/docs/features/structured_outputs.md b/docs/features/structured_outputs.md
index 4f737afa80f55..8a934d406f382 100644
--- a/docs/features/structured_outputs.md
+++ b/docs/features/structured_outputs.md
@@ -103,7 +103,7 @@ The next example shows how to use the `guided_json` parameter with a Pydantic mo
             "content": "Generate a JSON with the brand, model and car_type of the most iconic car from the 90's",
         }
     ],
-    "response_format": {
+    response_format={
         "type": "json_schema",
         "json_schema": {
             "name": "car-description",
diff --git a/docs/getting_started/installation/cpu.md b/docs/getting_started/installation/cpu.md
index 2d2598da943c7..7a34d47d8e494 100644
--- a/docs/getting_started/installation/cpu.md
+++ b/docs/getting_started/installation/cpu.md
@@ -164,7 +164,7 @@ Note, it is recommended to manually reserve 1 CPU for vLLM front-end process whe
 
 ### How to decide `VLLM_CPU_KVCACHE_SPACE`?
 
-  - This value is 4GB by default. Larger space can support more concurrent requests, longer context length. However, users should take care of memory capacity of each NUMA node.
+This value is 4GB by default. A larger space can support more concurrent requests and longer context lengths. However, users should take care of the memory capacity of each NUMA node. 
The memory usage of each TP rank is the sum of `weight shard size` and `VLLM_CPU_KVCACHE_SPACE`; if it exceeds the capacity of a single NUMA node, the TP worker will be killed with `exitcode 9` due to out-of-memory.

 ### How to do performance tuning for vLLM CPU?
 
@@ -183,13 +183,13 @@ vLLM CPU supports tensor parallel (TP) and pipeline parallel (PP) to leverage mu
 
 ### Which quantization configs does vLLM CPU support?
 
-  - vLLM CPU supports quantizations:
+- vLLM CPU supports the following quantizations:
     - AWQ (x86 only)
     - GPTQ (x86 only)
     - compressed-tensor INT8 W8A8 (x86, s390x)
 
 ### (x86 only) What is the purpose of `VLLM_CPU_MOE_PREPACK` and `VLLM_CPU_SGL_KERNEL`?
 
-  - Both of them requires `amx` CPU flag.
+- Both of them require the `amx` CPU flag.
     - `VLLM_CPU_MOE_PREPACK` can provides better performance for MoE models
     - `VLLM_CPU_SGL_KERNEL` can provides better performance for MoE models and small-batch scenarios.
 
diff --git a/docs/getting_started/installation/cpu/apple.inc.md b/docs/getting_started/installation/cpu/apple.inc.md
index 0816f38ac68a1..2828173a76a9a 100644
--- a/docs/getting_started/installation/cpu/apple.inc.md
+++ b/docs/getting_started/installation/cpu/apple.inc.md
@@ -1,6 +1,6 @@
 # --8<-- [start:installation]
 
-vLLM has experimental support for macOS with Apple silicon. For now, users shall build from the source vLLM to natively run on macOS.
+vLLM has experimental support for macOS with Apple silicon. For now, users must build from source to natively run on macOS.
 
 Currently the CPU implementation for macOS supports FP32 and FP16 datatypes.
 
@@ -23,20 +23,20 @@ Currently the CPU implementation for macOS supports FP32 and FP16 datatypes.
 
 # --8<-- [end:pre-built-wheels]
 # --8<-- [start:build-wheel-from-source]
 
-After installation of XCode and the Command Line Tools, which include Apple Clang, execute the following commands to build and install vLLM from the source. 
+After installing Xcode and the Command Line Tools, which include Apple Clang, execute the following commands to build and install vLLM from source.
 
 ```bash
 git clone https://github.com/vllm-project/vllm.git
 cd vllm
-pip install -r requirements/cpu.txt
-pip install -e .
+uv pip install -r requirements/cpu.txt
+uv pip install -e .
 ```
 
 !!! note
-    On macOS the `VLLM_TARGET_DEVICE` is automatically set to `cpu`, which currently is the only supported device.
+    On macOS the `VLLM_TARGET_DEVICE` is automatically set to `cpu`, which is currently the only supported device.
 
 !!! example "Troubleshooting"
-    If the build has error like the following snippet where standard C++ headers cannot be found, try to remove and reinstall your
+    If the build fails with errors like the following where standard C++ headers cannot be found, try to remove and reinstall your
     [Command Line Tools for Xcode](https://developer.apple.com/download/all/).
 
     ```text
diff --git a/docs/getting_started/installation/cpu/build.inc.md b/docs/getting_started/installation/cpu/build.inc.md
index fa777fe0c8a1a..57a09e674a821 100644
--- a/docs/getting_started/installation/cpu/build.inc.md
+++ b/docs/getting_started/installation/cpu/build.inc.md
@@ -1,4 +1,4 @@
-First, install recommended compiler. We recommend to use `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. For example, on Ubuntu 22.4, you can run:
+First, install the recommended compiler. We recommend using `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. 
For example, on Ubuntu 22.04, you can run:
 
 ```bash
 sudo apt-get update -y
@@ -6,28 +6,34 @@ sudo apt-get install -y --no-install-recommends ccache git curl wget ca-certific
 sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
 ```
 
-Second, clone vLLM project:
+Second, clone the vLLM project:
 
 ```bash
 git clone https://github.com/vllm-project/vllm.git vllm_source
 cd vllm_source
 ```
 
-Third, install Python packages for vLLM CPU backend building:
+Third, install required dependencies:
 
 ```bash
-pip install --upgrade pip
-pip install -v -r requirements/cpu-build.txt --extra-index-url https://download.pytorch.org/whl/cpu
-pip install -v -r requirements/cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
+uv pip install -r requirements/cpu-build.txt --torch-backend auto
+uv pip install -r requirements/cpu.txt --torch-backend auto
 ```
 
-Finally, build and install vLLM CPU backend:
+??? console "pip"
+    ```bash
+    pip install --upgrade pip
+    pip install -v -r requirements/cpu-build.txt --extra-index-url https://download.pytorch.org/whl/cpu
+    pip install -v -r requirements/cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
+    ```
+
+Finally, build and install vLLM:
 
 ```bash
 VLLM_TARGET_DEVICE=cpu python setup.py install
 ```
 
-If you want to develop vllm, install it in editable mode instead.
+If you want to develop vLLM, install it in editable mode instead.
 
 ```bash
 VLLM_TARGET_DEVICE=cpu python setup.py develop
diff --git a/docs/getting_started/installation/cpu/s390x.inc.md b/docs/getting_started/installation/cpu/s390x.inc.md
index acfb3396896bf..c1917267ce91b 100644
--- a/docs/getting_started/installation/cpu/s390x.inc.md
+++ b/docs/getting_started/installation/cpu/s390x.inc.md
@@ -1,6 +1,6 @@
 # --8<-- [start:installation]
 
-vLLM has experimental support for s390x architecture on IBM Z platform. For now, users shall build from the vLLM source to natively run on IBM Z platform. 
+vLLM has experimental support for s390x architecture on IBM Z platform. For now, users must build from source to natively run on IBM Z platform. Currently the CPU implementation for s390x architecture supports FP32 datatype only. @@ -40,21 +40,32 @@ curl https://sh.rustup.rs -sSf | sh -s -- -y && \ . "$HOME/.cargo/env" ``` -Execute the following commands to build and install vLLM from the source. +Execute the following commands to build and install vLLM from source. !!! tip - Please build the following dependencies, `torchvision`, `pyarrow` from the source before building vLLM. + Please build the following dependencies, `torchvision`, `pyarrow` from source before building vLLM. ```bash sed -i '/^torch/d' requirements-build.txt # remove torch from requirements-build.txt since we use nightly builds - pip install -v \ - --extra-index-url https://download.pytorch.org/whl/nightly/cpu \ + uv pip install -v \ + --torch-backend auto \ -r requirements-build.txt \ -r requirements-cpu.txt \ VLLM_TARGET_DEVICE=cpu python setup.py bdist_wheel && \ - pip install dist/*.whl + uv pip install dist/*.whl ``` +??? console "pip" + ```bash + sed -i '/^torch/d' requirements-build.txt # remove torch from requirements-build.txt since we use nightly builds + pip install -v \ + --extra-index-url https://download.pytorch.org/whl/nightly/cpu \ + -r requirements-build.txt \ + -r requirements-cpu.txt \ + VLLM_TARGET_DEVICE=cpu python setup.py bdist_wheel && \ + pip install dist/*.whl + ``` + # --8<-- [end:build-wheel-from-source] # --8<-- [start:pre-built-images] @@ -63,19 +74,19 @@ Execute the following commands to build and install vLLM from the source. ```bash docker build -f docker/Dockerfile.s390x \ - --tag vllm-cpu-env . + --tag vllm-cpu-env . 
-# Launching OpenAI server
+# Launch OpenAI server
 docker run --rm \
-    --privileged=true \
-    --shm-size=4g \
-    -p 8000:8000 \
-    -e VLLM_CPU_KVCACHE_SPACE= \
-    -e VLLM_CPU_OMP_THREADS_BIND= \
-    vllm-cpu-env \
-    --model=meta-llama/Llama-3.2-1B-Instruct \
-    --dtype=float \
-    other vLLM OpenAI server arguments
+    --privileged \
+    --shm-size 4g \
+    -p 8000:8000 \
+    -e VLLM_CPU_KVCACHE_SPACE= \
+    -e VLLM_CPU_OMP_THREADS_BIND= \
+    vllm-cpu-env \
+    --model meta-llama/Llama-3.2-1B-Instruct \
+    --dtype float \
+    other vLLM OpenAI server arguments
 ```
 
 # --8<-- [end:build-image-from-source]
diff --git a/docs/getting_started/installation/google_tpu.md b/docs/getting_started/installation/google_tpu.md
index 55d69d11fa401..6f09babb3aba0 100644
--- a/docs/getting_started/installation/google_tpu.md
+++ b/docs/getting_started/installation/google_tpu.md
@@ -85,7 +85,7 @@ gcloud alpha compute tpus queued-resources create QUEUED_RESOURCE_ID \
 | PROJECT_ID | Your Google Cloud project |
 | ZONE | The GCP zone where you want to create your Cloud TPU. The value you use depends on the version of TPUs you are using. For more information, see [TPU regions and zones] |
 | ACCELERATOR_TYPE | The TPU version you want to use. Specify the TPU version, for example `v5litepod-4` specifies a v5e TPU with 4 cores, `v6e-1` specifies a v6e TPU with 1 core. For more information, see [TPU versions]. |
-| RUNTIME_VERSION | The TPU VM runtime version to use. For example, use `v2-alpha-tpuv6e` for a VM loaded with one or more v6e TPU(s). For more information see [TPU VM images]. |
+| RUNTIME_VERSION | The TPU VM runtime version to use. For example, use `v2-alpha-tpuv6e` for a VM loaded with one or more v6e TPU(s). |
 | SERVICE_ACCOUNT | The email address for your service account. You can find it in the IAM Cloud Console under *Service Accounts*. 
For example: `tpu-service-account@.iam.gserviceaccount.com` | Connect to your TPU VM using SSH: @@ -94,6 +94,9 @@ Connect to your TPU VM using SSH: gcloud compute tpus tpu-vm ssh TPU_NAME --project PROJECT_ID --zone ZONE ``` +!!! note + When configuring `RUNTIME_VERSION` ("TPU software version") on GCP, ensure it matches the TPU generation you've selected by referencing the [TPU VM images] compatibility matrix. Using an incompatible version may prevent vLLM from running correctly. + [TPU versions]: https://cloud.google.com/tpu/docs/runtimes [TPU VM images]: https://cloud.google.com/tpu/docs/runtimes [TPU regions and zones]: https://cloud.google.com/tpu/docs/regions-zones diff --git a/docs/getting_started/installation/gpu/cuda.inc.md b/docs/getting_started/installation/gpu/cuda.inc.md index 5ca5296d0a657..69a9842e4719b 100644 --- a/docs/getting_started/installation/gpu/cuda.inc.md +++ b/docs/getting_started/installation/gpu/cuda.inc.md @@ -20,16 +20,16 @@ Therefore, it is recommended to install vLLM with a **fresh new** environment. I # --8<-- [end:set-up-using-python] # --8<-- [start:pre-built-wheels] -You can install vLLM using either `pip` or `uv pip`: - ```bash -# Install vLLM with CUDA 12.8. -# If you are using pip. -pip install vllm --extra-index-url https://download.pytorch.org/whl/cu128 -# If you are using uv. uv pip install vllm --torch-backend=auto ``` +??? console "pip" + ```bash + # Install vLLM with CUDA 12.8. + pip install vllm --extra-index-url https://download.pytorch.org/whl/cu128 + ``` + We recommend leveraging `uv` to [automatically select the appropriate PyTorch index at runtime](https://docs.astral.sh/uv/guides/integration/pytorch/#automatic-backend-selection) by inspecting the installed CUDA driver version via `--torch-backend=auto` (or `UV_TORCH_BACKEND=auto`). To select a specific backend (e.g., `cu126`), set `--torch-backend=cu126` (or `UV_TORCH_BACKEND=cu126`). If this doesn't work, try running `uv self update` to update `uv` first. !!! 
note @@ -38,10 +38,10 @@ We recommend leveraging `uv` to [automatically select the appropriate PyTorch in As of now, vLLM's binaries are compiled with CUDA 12.8 and public PyTorch release versions by default. We also provide vLLM binaries compiled with CUDA 12.6, 11.8, and public PyTorch release versions: ```bash -# Install vLLM with CUDA 11.8. -export VLLM_VERSION=0.6.1.post1 -export PYTHON_VERSION=312 -uv pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118 +# Install vLLM with a specific CUDA version (e.g., 11.8 or 12.6). +export VLLM_VERSION=$(curl -s https://api.github.com/repos/vllm-project/vllm/releases/latest | jq -r .tag_name | sed 's/^v//') +export CUDA_VERSION=118 # or 126 +uv pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu${CUDA_VERSION}-cp38-abi3-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu${CUDA_VERSION} ``` [](){ #install-the-latest-code } @@ -50,36 +50,22 @@ uv pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VE LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on a x86 platform with CUDA 12 for every commit since `v0.5.3`. -##### Install the latest code using `pip` - -```bash -pip install -U vllm \ - --pre \ - --extra-index-url https://wheels.vllm.ai/nightly -``` - -`--pre` is required for `pip` to consider pre-released versions. - -Another way to install the latest code is to use `uv`: - ```bash uv pip install -U vllm \ --torch-backend=auto \ --extra-index-url https://wheels.vllm.ai/nightly ``` -##### Install specific revisions using `pip` +??? 
console "pip" + ```bash + pip install -U vllm \ + --pre \ + --extra-index-url https://wheels.vllm.ai/nightly + ``` -If you want to access the wheels for previous commits (e.g. to bisect the behavior change, performance regression), due to the limitation of `pip`, you have to specify the full URL of the wheel file by embedding the commit hash in the URL: + `--pre` is required for `pip` to consider pre-released versions. -```bash -export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch -pip install https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl -``` - -Note that the wheels are built with Python 3.8 ABI (see [PEP 425](https://peps.python.org/pep-0425/) for more details about ABI), so **they are compatible with Python 3.8 and later**. The version string in the wheel file name (`1.0.0.dev`) is just a placeholder to have a unified URL for the wheels, the actual versions of wheels are contained in the wheel metadata (the wheels listed in the extra index url have correct versions). Although we don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the wheels are still built with Python 3.8 ABI to keep the same wheel name as before. - -##### Install specific revisions using `uv` +##### Install specific revisions If you want to access the wheels for previous commits (e.g. to bisect the behavior change, performance regression), you can specify the commit hash in the URL: @@ -92,17 +78,35 @@ uv pip install vllm \ The `uv` approach works for vLLM `v0.6.6` and later and offers an easy-to-remember command. A unique feature of `uv` is that packages in `--extra-index-url` have [higher priority than the default index](https://docs.astral.sh/uv/pip/compatibility/#packages-that-exist-on-multiple-indexes). If the latest public release is `v0.6.6.post1`, `uv`'s behavior allows installing a commit before `v0.6.6.post1` by specifying the `--extra-index-url`. 
In contrast, `pip` combines packages from `--extra-index-url` and the default index, choosing only the latest version, which makes it difficult to install a development version prior to the released version. +??? note "pip" + If you want to access the wheels for previous commits (e.g. to bisect the behavior change, + performance regression), due to the limitation of `pip`, you have to specify the full URL of the + wheel file by embedding the commit hash in the URL: + + ```bash + export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch + pip install https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl + ``` + + Note that the wheels are built with Python 3.8 ABI (see [PEP + 425](https://peps.python.org/pep-0425/) for more details about ABI), so **they are compatible + with Python 3.8 and later**. The version string in the wheel file name (`1.0.0.dev`) is just a + placeholder to have a unified URL for the wheels, the actual versions of wheels are contained in + the wheel metadata (the wheels listed in the extra index url have correct versions). Although we + don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the + wheels are still built with Python 3.8 ABI to keep the same wheel name as before. + # --8<-- [end:pre-built-wheels] # --8<-- [start:build-wheel-from-source] #### Set up using Python-only build (without compilation) -If you only need to change Python code, you can build and install vLLM without compilation. Using `pip`'s [`--editable` flag](https://pip.pypa.io/en/stable/topics/local-project-installs/#editable-installs), changes you make to the code will be reflected when you run vLLM: +If you only need to change Python code, you can build and install vLLM without compilation. 
Using `uv pip`'s [`--editable` flag](https://docs.astral.sh/uv/pip/packages/#editable-packages), changes you make to the code will be reflected when you run vLLM: ```bash git clone https://github.com/vllm-project/vllm.git cd vllm -VLLM_USE_PRECOMPILED=1 pip install --editable . +VLLM_USE_PRECOMPILED=1 uv pip install --editable . ``` This command will do the following: @@ -121,7 +125,7 @@ In case you see an error about wheel not found when running the above command, i ```bash export VLLM_COMMIT=72d9c316d3f6ede485146fe5aabd4e61dbc59069 # use full commit hash from the main branch export VLLM_PRECOMPILED_WHEEL_LOCATION=https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl -pip install --editable . +uv pip install --editable . ``` You can find more information about vLLM's wheels in [install-the-latest-code][install-the-latest-code]. @@ -137,7 +141,7 @@ If you want to modify C++ or CUDA code, you'll need to build vLLM from source. T ```bash git clone https://github.com/vllm-project/vllm.git cd vllm -pip install -e . +uv pip install -e . ``` !!! tip @@ -152,14 +156,14 @@ pip install -e . The following environment variables can be set to configure the vLLM `sccache` remote: `SCCACHE_BUCKET=vllm-build-sccache SCCACHE_REGION=us-west-2 SCCACHE_S3_NO_CREDENTIALS=1`. We also recommend setting `SCCACHE_IDLE_TIMEOUT=0`. !!! note "Faster Kernel Development" - For frequent C++/CUDA kernel changes, after the initial `pip install -e .` setup, consider using the [Incremental Compilation Workflow](../../contributing/incremental_build.md) for significantly faster rebuilds of only the modified kernel code. + For frequent C++/CUDA kernel changes, after the initial `uv pip install -e .` setup, consider using the [Incremental Compilation Workflow](../../contributing/incremental_build.md) for significantly faster rebuilds of only the modified kernel code. 
##### Use an existing PyTorch installation -There are scenarios where the PyTorch dependency cannot be easily installed via pip, e.g.: +There are scenarios where the PyTorch dependency cannot be easily installed with `uv`, e.g.: - Building vLLM with PyTorch nightly or a custom PyTorch build. -- Building vLLM with aarch64 and CUDA (GH200), where the PyTorch wheels are not available on PyPI. Currently, only the PyTorch nightly has wheels for aarch64 with CUDA. You can run `pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu124` to [install PyTorch nightly](https://pytorch.org/get-started/locally/), and then build vLLM on top of it. +- Building vLLM with aarch64 and CUDA (GH200), where the PyTorch wheels are not available on PyPI. Currently, only the PyTorch nightly has wheels for aarch64 with CUDA. You can run `uv pip install --index-url https://download.pytorch.org/whl/nightly/cu128 torch torchvision torchaudio` to [install PyTorch nightly](https://pytorch.org/get-started/locally/) and then build vLLM on top of it. To build vLLM using an existing PyTorch installation: @@ -167,8 +171,8 @@ To build vLLM using an existing PyTorch installation: git clone https://github.com/vllm-project/vllm.git cd vllm python use_existing_torch.py -pip install -r requirements/build.txt -pip install --no-build-isolation -e . +uv pip install -r requirements/build.txt +uv pip install --no-build-isolation -e . ``` ##### Use the local cutlass for compilation @@ -179,7 +183,7 @@ To achieve this, you can set the environment variable VLLM_CUTLASS_SRC_DIR to po ```bash git clone https://github.com/vllm-project/vllm.git cd vllm -VLLM_CUTLASS_SRC_DIR=/path/to/cutlass pip install -e . +VLLM_CUTLASS_SRC_DIR=/path/to/cutlass uv pip install -e . ``` ##### Troubleshooting @@ -189,7 +193,7 @@ to be run simultaneously, via the environment variable `MAX_JOBS`. For example: ```bash export MAX_JOBS=6 -pip install -e . +uv pip install -e . 
``` This is especially useful when you are building on less powerful machines. For example, when you use WSL it only [assigns 50% of the total memory by default](https://learn.microsoft.com/en-us/windows/wsl/wsl-config#main-wsl-settings), so using `export MAX_JOBS=1` can avoid compiling multiple files simultaneously and running out of memory. @@ -228,7 +232,7 @@ Simply disable the `VLLM_TARGET_DEVICE` environment variable before installing: ```bash export VLLM_TARGET_DEVICE=empty -pip install -e . +uv pip install -e . ``` # --8<-- [end:build-wheel-from-source] diff --git a/docs/getting_started/installation/intel_gaudi.md b/docs/getting_started/installation/intel_gaudi.md index 0be0d02d0679c..61b2b02aa10ba 100644 --- a/docs/getting_started/installation/intel_gaudi.md +++ b/docs/getting_started/installation/intel_gaudi.md @@ -339,13 +339,13 @@ Each described step is logged by vLLM server, as follows (negative values corres - `VLLM_{phase}_{dim}_BUCKET_{param}` - collection of 12 environment variables configuring ranges of bucketing mechanism - * `{phase}` is either `PROMPT` or `DECODE` + - `{phase}` is either `PROMPT` or `DECODE` - * `{dim}` is either `BS`, `SEQ` or `BLOCK` + - `{dim}` is either `BS`, `SEQ` or `BLOCK` - * `{param}` is either `MIN`, `STEP` or `MAX` + - `{param}` is either `MIN`, `STEP` or `MAX` - * Default values: + - Default values: | `{phase}` | Parameter | Env Variable | Value Expression | |-----------|-----------|--------------|------------------| diff --git a/docs/getting_started/quickstart.md b/docs/getting_started/quickstart.md index 74235db16a15d..f833807666460 100644 --- a/docs/getting_started/quickstart.md +++ b/docs/getting_started/quickstart.md @@ -98,6 +98,43 @@ for output in outputs: print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` +!!! note + The `llm.generate` method does not automatically apply the model's chat template to the input prompt. 
Therefore, if you are using an Instruct model or Chat model, you should manually apply the corresponding chat template to ensure the expected behavior. Alternatively, you can use the `llm.chat` method and pass a list of messages that have the same format as those passed to OpenAI's `client.chat.completions`:
+
+    ??? code
+
+        ```python
+        # Using the tokenizer to apply the chat template
+        from transformers import AutoTokenizer
+
+        tokenizer = AutoTokenizer.from_pretrained("/path/to/chat_model")
+        messages_list = [
+            [{"role": "user", "content": prompt}]
+            for prompt in prompts
+        ]
+        texts = tokenizer.apply_chat_template(
+            messages_list,
+            tokenize=False,
+            add_generation_prompt=True,
+        )
+
+        # Generate outputs
+        outputs = llm.generate(texts, sampling_params)
+
+        # Print the outputs.
+        for output in outputs:
+            prompt = output.prompt
+            generated_text = output.outputs[0].text
+            print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+        # Using the chat interface.
+        outputs = llm.chat(messages_list, sampling_params)
+        for idx, output in enumerate(outputs):
+            prompt = prompts[idx]
+            generated_text = output.outputs[0].text
+            print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+        ```
+

[](){ #quickstart-online }

## OpenAI-Compatible Server

@@ -126,6 +163,7 @@ curl http://localhost:8000/v1/models
```

You can pass in the argument `--api-key` or environment variable `VLLM_API_KEY` to enable the server to check for an API key in the header.
+You can pass multiple keys after `--api-key`; the server will accept any of them, which can be useful for key rotation.
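+To make the multiple-key behavior concrete, here is a minimal plain-Python sketch of the check a server performs during a key rotation. This is an illustration of the semantics only, not vLLM's implementation, and the key values are made up:
+
+    ```python
+    # Sketch of bearer-token checking against several accepted keys, as when
+    # serving with e.g. `--api-key old-secret new-secret` during a rotation.
+    # Illustrative only; not vLLM's actual code.
+    ALLOWED_KEYS = {"old-secret", "new-secret"}
+
+    def is_authorized(headers: dict[str, str]) -> bool:
+        """Accept a request if it carries any of the configured keys."""
+        scheme, _, token = headers.get("Authorization", "").partition(" ")
+        return scheme == "Bearer" and token in ALLOWED_KEYS
+
+    print(is_authorized({"Authorization": "Bearer new-secret"}))  # True
+    print(is_authorized({"Authorization": "Bearer old-secret"}))  # True
+    print(is_authorized({"Authorization": "Bearer revoked"}))     # False
+    ```
+
+Once all clients have moved to the new key, restart the server with only that key so the old one stops being accepted.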
### OpenAI Completions API with vLLM

diff --git a/docs/models/hardware_supported_models/tpu.md b/docs/models/hardware_supported_models/tpu.md
index da03a3b3160ad..7b0a5ba6e72da 100644
--- a/docs/models/hardware_supported_models/tpu.md
+++ b/docs/models/hardware_supported_models/tpu.md
@@ -1,7 +1,8 @@
# TPU

-# TPU Supported Models
-## Text-only Language Models
+## Supported Models
+
+### Text-only Language Models

| Model | Architecture | Supported |
|-----------------------------------------------------|--------------------------------|-----------|
diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md
index a06d86523af1a..c6588363b63fb 100644
--- a/docs/models/pooling_models.md
+++ b/docs/models/pooling_models.md
@@ -7,9 +7,9 @@ These models use a [Pooler][vllm.model_executor.layers.pooler.Pooler] to extract
before returning them.

!!! note
-    We currently support pooling models primarily as a matter of convenience.
-    As shown in the [Compatibility Matrix](../features/compatibility_matrix.md), most vLLM features are not applicable to
-    pooling models as they only work on the generation or decode stage, so performance may not improve as much.
+    We currently support pooling models primarily as a matter of convenience. They are not guaranteed to be any faster than using HF Transformers / Sentence Transformers directly.
+
+    We are now planning to optimize pooling models in vLLM. Please comment if you have any suggestions!
## Configuration

@@ -45,14 +45,14 @@ Each pooling model in vLLM supports one or more of these tasks according to
[Pooler.get_supported_tasks][vllm.model_executor.layers.pooler.Pooler.get_supported_tasks],
enabling the corresponding APIs:

-| Task       | APIs               |
-|------------|--------------------|
-| `encode`   | `encode`           |
-| `embed`    | `embed`, `score`\* |
-| `classify` | `classify`         |
-| `score`    | `score`            |
+| Task       | APIs                                 |
+|------------|--------------------------------------|
+| `encode`   | `LLM.reward(...)`                    |
+| `embed`    | `LLM.embed(...)`, `LLM.score(...)`\* |
+| `classify` | `LLM.classify(...)`                  |
+| `score`    | `LLM.score(...)`                     |

-\* The `score` API falls back to `embed` task if the model does not support `score` task.
+\* The `LLM.score(...)` API falls back to the `embed` task if the model does not support the `score` task.

### Pooler Configuration

@@ -66,11 +66,11 @@ you can override some of its attributes via the `--override-pooler-config` option

If the model has been converted via `--convert` (see above),
the pooler assigned to each task has the following attributes by default:

-| Task       | Pooling Type   | Normalization | Softmax |
-|------------|----------------|---------------|---------|
-| `encode`   | `ALL`          | ❌            | ❌      |
-| `embed`    | `LAST`         | ✅︎            | ❌      |
-| `classify` | `LAST`         | ❌            | ✅︎      |
+| Task       | Pooling Type | Normalization | Softmax |
+|------------|--------------|---------------|---------|
+| `reward`   | `ALL`        | ❌            | ❌      |
+| `embed`    | `LAST`       | ✅︎            | ❌      |
+| `classify` | `LAST`       | ❌            | ✅︎      |

When loading [Sentence Transformers](https://huggingface.co/sentence-transformers) models,
its Sentence Transformers configuration file (`modules.json`) takes priority over the model's defaults.

@@ -83,21 +83,6 @@ which takes priority over both the model's and Sentence Transformers's defaults.

The [LLM][vllm.LLM] class provides various methods for offline inference.
See [configuration][configuration] for a list of options when initializing the model.
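+To make the pooler defaults in the table above concrete, here is a plain-Python sketch of the three behaviors (`ALL`/`LAST` pooling, normalization, softmax). The hidden-state values are illustrative and this is not vLLM's pooler code:
+
+    ```python
+    import math
+
+    # One hidden-state vector per token (illustrative values)
+    hidden_states = [[0.5, 1.0], [2.0, 0.0], [1.0, 3.0]]
+
+    # `LAST` pooling keeps only the final token's hidden state
+    last = hidden_states[-1]
+
+    # `embed` default: LAST pooling + normalization to unit L2 norm
+    norm = math.sqrt(sum(x * x for x in last))
+    embedding = [x / norm for x in last]
+
+    # `classify` default: LAST pooling + softmax over the logits
+    exps = [math.exp(x) for x in last]
+    probs = [e / sum(exps) for e in exps]
+
+    # `reward` default: `ALL` pooling returns every token's hidden state unchanged
+    reward_output = hidden_states
+
+    print(f"embedding={embedding}\nprobs={probs}")
+    ```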
-### `LLM.encode` - -The [encode][vllm.LLM.encode] method is available to all pooling models in vLLM. -It returns the extracted hidden states directly, which is useful for reward models. - -```python -from vllm import LLM - -llm = LLM(model="Qwen/Qwen2.5-Math-RM-72B", runner="pooling") -(output,) = llm.encode("Hello, my name is") - -data = output.outputs.data -print(f"Data: {data!r}") -``` - ### `LLM.embed` The [embed][vllm.LLM.embed] method outputs an embedding vector for each prompt. @@ -106,7 +91,7 @@ It is primarily designed for embedding models. ```python from vllm import LLM -llm = LLM(model="intfloat/e5-mistral-7b-instruct", runner="pooling") +llm = LLM(model="intfloat/e5-small", runner="pooling") (output,) = llm.embed("Hello, my name is") embeds = output.outputs.embedding @@ -135,7 +120,7 @@ A code example can be found here: +### `LLM.reward` + +The [reward][vllm.LLM.reward] method is available to all reward models in vLLM. +It returns the extracted hidden states directly. + +```python +from vllm import LLM + +llm = LLM(model="internlm/internlm2-1_8b-reward", runner="pooling", trust_remote_code=True) +(output,) = llm.reward("Hello, my name is") + +data = output.outputs.data +print(f"Data: {data!r}") +``` + +A code example can be found here: + +### `LLM.encode` + +The [encode][vllm.LLM.encode] method is available to all pooling models in vLLM. +It returns the extracted hidden states directly. + +!!! note + Please use one of the more specific methods or set the task directly when using `LLM.encode`: + + - For embeddings, use `LLM.embed(...)` or `pooling_task="embed"`. + - For classification logits, use `LLM.classify(...)` or `pooling_task="classify"`. + - For rewards, use `LLM.reward(...)` or `pooling_task="reward"`. + - For similarity scores, use `LLM.score(...)`. 
+ +```python +from vllm import LLM + +llm = LLM(model="intfloat/e5-small", runner="pooling") +(output,) = llm.encode("Hello, my name is", pooling_task="embed") + +data = output.outputs.data +print(f"Data: {data!r}") +``` + ## Online Serving Our [OpenAI-Compatible Server](../serving/openai_compatible_server.md) provides endpoints that correspond to the offline APIs: diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 763dd8fd50451..017a339ffca0c 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -45,10 +45,10 @@ If a model is neither supported natively by vLLM or Transformers, it can still b For a model to be compatible with the Transformers backend for vLLM it must: - be a Transformers compatible custom model (see [Transformers - Customizing models](https://huggingface.co/docs/transformers/en/custom_models)): - * The model directory must have the correct structure (e.g. `config.json` is present). - * `config.json` must contain `auto_map.AutoModel`. + - The model directory must have the correct structure (e.g. `config.json` is present). + - `config.json` must contain `auto_map.AutoModel`. - be a Transformers backend for vLLM compatible model (see [writing-custom-models][writing-custom-models]): - * Customisation should be done in the base model (e.g. in `MyModel`, not `MyModelForCausalLM`). + - Customisation should be done in the base model (e.g. in `MyModel`, not `MyModelForCausalLM`). If the compatible model is: @@ -134,10 +134,10 @@ class MyConfig(PretrainedConfig): - `base_model_tp_plan` is a `dict` that maps fully qualified layer name patterns to tensor parallel styles (currently only `"colwise"` and `"rowwise"` are supported). 
- `base_model_pp_plan` is a `dict` that maps direct child layer names to `tuple`s of `list`s of `str`s: - * You only need to do this for layers which are not present on all pipeline stages - * vLLM assumes that there will be only one `nn.ModuleList`, which is distributed across the pipeline stages - * The `list` in the first element of the `tuple` contains the names of the input arguments - * The `list` in the last element of the `tuple` contains the names of the variables the layer outputs to in your modeling code + - You only need to do this for layers which are not present on all pipeline stages + - vLLM assumes that there will be only one `nn.ModuleList`, which is distributed across the pipeline stages + - The `list` in the first element of the `tuple` contains the names of the input arguments + - The `list` in the last element of the `tuple` contains the names of the variables the layer outputs to in your modeling code ## Loading a Model @@ -255,7 +255,7 @@ export https_proxy=http://your.proxy.server:port https_proxy=http://your.proxy.server:port huggingface-cli download # or use vllm cmd directly -https_proxy=http://your.proxy.server:port vllm serve --disable-log-requests +https_proxy=http://your.proxy.server:port vllm serve ``` - Set the proxy in Python interpreter: @@ -311,6 +311,8 @@ See [this page](generative_models.md) for more information on how to use generat #### Text Generation +These models primarily accept the [`LLM.generate`](./generative_models.md#llmgenerate) API. Chat/Instruct models additionally support the [`LLM.chat`](./generative_models.md#llmchat) API. +
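+The `base_model_tp_plan` / `base_model_pp_plan` attributes described earlier can be sketched for a hypothetical decoder-only model as follows. All layer-name patterns and argument names here are illustrative, not taken from any real model:
+
+```python
+# Hypothetical parallelism plans for a model whose decoder stack lives in
+# an `nn.ModuleList` named "layers"; names are illustrative only.
+
+# Fully qualified layer-name patterns -> tensor parallel style
+# (only "colwise" and "rowwise" are supported)
+base_model_tp_plan = {
+    "layers.*.self_attn.q_proj": "colwise",
+    "layers.*.self_attn.o_proj": "rowwise",
+    "layers.*.mlp.up_proj": "colwise",
+    "layers.*.mlp.down_proj": "rowwise",
+}
+
+# Direct child layer names -> (input argument names, output variable names),
+# listed only for layers that are not present on every pipeline stage
+base_model_pp_plan = {
+    "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+    "layers": (["hidden_states"], ["hidden_states"]),
+    "norm": (["hidden_states"], ["hidden_states"]),
+}
+
+assert set(base_model_tp_plan.values()) <= {"colwise", "rowwise"}
+```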