diff --git a/.buildkite/nightly-benchmarks/README.md b/.buildkite/nightly-benchmarks/README.md index cdf6a645147e5..fcde284efea98 100644 --- a/.buildkite/nightly-benchmarks/README.md +++ b/.buildkite/nightly-benchmarks/README.md @@ -28,6 +28,7 @@ See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performanc ## Trigger the benchmark Performance benchmark will be triggered when: + - A PR being merged into vllm. - Every commit for those PRs with `perf-benchmarks` label AND `ready` label. @@ -38,6 +39,7 @@ bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh ``` Runtime environment variables: + - `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0. - `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file). - `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file). @@ -46,12 +48,14 @@ Runtime environment variables: - `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string. Nightly benchmark will be triggered when: + - Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label. ## Performance benchmark details See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases. > NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead. +> ### Latency test Here is an example of one test inside `latency-tests.json`: @@ -74,7 +78,7 @@ Here is an example of one test inside `latency-tests.json`: In this example: - The `test_name` attributes is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`. -- The `parameters` attribute control the command line arguments to be used for `benchmark_latency.py`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underline to dash when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15` +- The `parameters` attribute control the command line arguments to be used for `vllm bench latency`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underline to dash when feeding the arguments to `vllm bench latency`. For example, the corresponding command line arguments for `vllm bench latency` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15` Note that the performance numbers are highly sensitive to the value of the parameters. Please make sure the parameters are set correctly. @@ -82,13 +86,13 @@ WARNING: The benchmarking script will save json results by itself, so please do ### Throughput test -The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except for that the parameters will be fed forward to `benchmark_throughput.py`. +The tests are specified in `throughput-tests.json`. 
The syntax is similar to `latency-tests.json`, except for that the parameters will be fed forward to `vllm bench throughput`. The number of this test is also stable -- a slight change on the value of this number might vary the performance numbers by a lot. ### Serving test -We test the throughput by using `benchmark_serving.py` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example: +We test the throughput by using `vllm bench serve` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example: ```json [ @@ -118,8 +122,8 @@ Inside this example: - The `test_name` attribute is also a unique identifier for the test. It must start with `serving_`. - The `server-parameters` includes the command line arguments for vLLM server. -- The `client-parameters` includes the command line arguments for `benchmark_serving.py`. -- The `qps_list` controls the list of qps for test. It will be used to configure the `--request-rate` parameter in `benchmark_serving.py` +- The `client-parameters` includes the command line arguments for `vllm bench serve`. +- The `qps_list` controls the list of qps for test. It will be used to configure the `--request-rate` parameter in `vllm bench serve` The number of this test is less stable compared to the delay and latency benchmarks (due to randomized sharegpt dataset sampling inside `benchmark_serving.py`), but a large change on this number (e.g. 5% change) still vary the output greatly. @@ -149,6 +153,7 @@ Here is an example using the script to compare result_a and result_b without det Here is an example using the script to compare result_a and result_b with detail test name. `python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json` + | | results_a/benchmark_results.json_name | results_a/benchmark_results.json | results_b/benchmark_results.json_name | results_b/benchmark_results.json | perf_ratio | |---|---------------------------------------------|----------------------------------------|---------------------------------------------|----------------------------------------|----------| | 0 | serving_llama8B_tp1_sharegpt_qps_1 | 142.633982 | serving_llama8B_tp1_sharegpt_qps_1 | 156.526018 | 1.097396 | diff --git a/.buildkite/nightly-benchmarks/nightly-annotation.md b/.buildkite/nightly-benchmarks/nightly-annotation.md index ef11c040057c8..466def07b6f1f 100644 --- a/.buildkite/nightly-benchmarks/nightly-annotation.md +++ b/.buildkite/nightly-benchmarks/nightly-annotation.md @@ -1,3 +1,4 @@ +# Nightly benchmark annotation ## Description @@ -13,15 +14,15 @@ Please download the visualization scripts in the post - Find the docker we use in `benchmarking pipeline` - Deploy the docker, and inside the docker: - - Download `nightly-benchmarks.zip`. - - In the same folder, run the following code: + - Download `nightly-benchmarks.zip`. + - In the same folder, run the following code: - ```bash - export HF_TOKEN= - apt update - apt install -y git - unzip nightly-benchmarks.zip - VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh - ``` + ```bash + export HF_TOKEN= + apt update + apt install -y git + unzip nightly-benchmarks.zip + VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh + ``` And the results will be inside `./benchmarks/results`. 
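Returning to the latency/serving test format described in the benchmarks README above: to make the underscore-to-dash convention concrete, here is a minimal Python sketch of what the harness does with a test entry. Underscore keys become dashed CLI flags, empty-string values become bare flags, and each value in `qps_list` turns into a separate `vllm bench serve` run via `--request-rate`. The test entry below is illustrative only, and the real conversion lives in the bash/`jq` helpers of `run-performance-benchmarks.sh`, not in Python.

```python
# Illustrative sketch (not the actual harness): convert a test entry's
# underscore-keyed parameters into `vllm bench serve` command lines.
import shlex

test_case = {  # hypothetical entry, not copied from serving-tests.json
    "test_name": "serving_example",
    "qps_list": [1, 4, "inf"],
    "client_parameters": {
        "model": "meta-llama/Meta-Llama-3-8B",
        "dataset_name": "sharegpt",
        "num_prompts": 200,
    },
}

def to_args(params: dict) -> str:
    # Underscores in keys become dashes; empty values become bare flags
    # (e.g. "enforce_eager": "" -> --enforce-eager).
    parts = []
    for key, value in params.items():
        flag = "--" + key.replace("_", "-")
        parts.append(flag if value == "" else f"{flag} {shlex.quote(str(value))}")
    return " ".join(parts)

# One benchmark run per qps value, fed in through --request-rate.
for qps in test_case["qps_list"]:
    print(f"vllm bench serve {to_args(test_case['client_parameters'])} --request-rate {qps}")
```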
diff --git a/.buildkite/nightly-benchmarks/nightly-descriptions.md b/.buildkite/nightly-benchmarks/nightly-descriptions.md index 5f003f42f07c0..8afde017d383e 100644 --- a/.buildkite/nightly-benchmarks/nightly-descriptions.md +++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md @@ -13,25 +13,25 @@ Latest reproduction guilde: [github issue link](https://github.com/vllm-project/ ## Setup - Docker images: - - vLLM: `vllm/vllm-openai:v0.6.2` - - SGLang: `lmsysorg/sglang:v0.3.2-cu121` - - LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12` - - TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3` - - *NOTE: we uses r24.07 as the current implementation only works for this version. We are going to bump this up.* - - Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark. + - vLLM: `vllm/vllm-openai:v0.6.2` + - SGLang: `lmsysorg/sglang:v0.3.2-cu121` + - LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12` + - TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3` + - *NOTE: we uses r24.07 as the current implementation only works for this version. We are going to bump this up.* + - Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark. - Hardware - - 8x Nvidia A100 GPUs + - 8x Nvidia A100 GPUs - Workload: - - Dataset - - ShareGPT dataset - - Prefill-heavy dataset (in average 462 input tokens, 16 tokens as output) - - Decode-heavy dataset (in average 462 input tokens, 256 output tokens) - - Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of datasets we use. - - Models: llama-3 8B, llama-3 70B. - - We do not use llama 3.1 as it is incompatible with trt-llm r24.07. ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)). - - Average QPS (query per second): 2, 4, 8, 16, 32 and inf. - - Queries are randomly sampled, and arrival patterns are determined via Poisson process, but all with fixed random seed. - - Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better). + - Dataset + - ShareGPT dataset + - Prefill-heavy dataset (in average 462 input tokens, 16 tokens as output) + - Decode-heavy dataset (in average 462 input tokens, 256 output tokens) + - Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of datasets we use. + - Models: llama-3 8B, llama-3 70B. + - We do not use llama 3.1 as it is incompatible with trt-llm r24.07. ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)). + - Average QPS (query per second): 2, 4, 8, 16, 32 and inf. + - Queries are randomly sampled, and arrival patterns are determined via Poisson process, but all with fixed random seed. + - Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better). 
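For readers unfamiliar with how the fixed-QPS Poisson arrival pattern described above is realized, the snippet below is an illustrative Python sketch rather than the benchmark client's actual code: inter-arrival gaps of a Poisson process with rate `qps` are exponentially distributed with mean `1/qps`, and fixing the random seed keeps the request schedule identical across the serving engines being compared.

```python
# Illustrative sketch: Poisson arrivals at an average rate of `qps` requests/second.
# A fixed seed makes the schedule reproducible across engines; the function name
# and defaults here are hypothetical.
import random

def poisson_arrival_times(num_requests: int, qps: float, seed: int = 0) -> list[float]:
    rng = random.Random(seed)
    t, times = 0.0, []
    for _ in range(num_requests):
        t += rng.expovariate(qps)  # exponential gap with mean 1/qps
        times.append(round(t, 3))
    return times

print(poisson_arrival_times(num_requests=5, qps=2))
```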
## Known issues diff --git a/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md b/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md index a1f8441ccdac8..8bb16bd3cf373 100644 --- a/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md +++ b/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md @@ -1,3 +1,4 @@ +# Performance benchmarks descriptions ## Latency tests diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py index 724b53056ca8f..554256b4bdb8b 100644 --- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py +++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py @@ -44,6 +44,7 @@ serving_column_mapping = { "test_name": "Test name", "gpu_type": "GPU", "completed": "# of req.", + "max_concurrency": "# of max concurrency.", "request_throughput": "Tput (req/s)", "total_token_throughput": "Total Token Tput (tok/s)", "output_throughput": "Output Tput (tok/s)", @@ -100,7 +101,7 @@ if __name__ == "__main__": raw_result = json.loads(f.read()) if "serving" in str(test_file): - # this result is generated via `benchmark_serving.py` + # this result is generated via `vllm bench serve` command # attach the benchmarking command to raw_result try: @@ -120,7 +121,7 @@ if __name__ == "__main__": continue elif "latency" in f.name: - # this result is generated via `benchmark_latency.py` + # this result is generated via `vllm bench latency` command # attach the benchmarking command to raw_result try: @@ -148,7 +149,7 @@ if __name__ == "__main__": continue elif "throughput" in f.name: - # this result is generated via `benchmark_throughput.py` + # this result is generated via `vllm bench throughput` command # attach the benchmarking command to raw_result try: diff --git a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh index 4d01a314adc47..06d7b5ed484da 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh @@ -73,7 +73,7 @@ get_current_llm_serving_engine() { echo "Container: vllm" # move to a completely irrelevant directory, to avoid import vllm from current folder export CURRENT_LLM_SERVING_ENGINE=vllm - + return fi } @@ -95,12 +95,14 @@ json2args() { } kill_gpu_processes() { - pkill -f python - pkill -f python3 - pkill -f tritonserver - pkill -f pt_main_thread - pkill -f text-generation - pkill -f lmdeploy + pkill -f '[p]ython' + pkill -f '[p]ython3' + pkill -f '[t]ritonserver' + pkill -f '[p]t_main_thread' + pkill -f '[t]ext-generation' + pkill -f '[l]mdeploy' + # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445 + pkill -f '[V]LLM' while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do sleep 1 @@ -125,7 +127,7 @@ ensure_installed() { } run_serving_tests() { - # run serving tests using `benchmark_serving.py` + # run serving tests using `vllm bench serve` command # $1: a json file specifying serving test cases local serving_test_file @@ -225,7 +227,7 @@ run_serving_tests() { if [[ "$dataset_name" = "sharegpt" ]]; then - client_command="python3 benchmark_serving.py \ + client_command="vllm bench serve \ --backend $backend \ --tokenizer /tokenizer_cache \ --model $model \ @@ -246,7 +248,7 @@ run_serving_tests() { 
sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len') sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len') - client_command="python3 benchmark_serving.py \ + client_command="vllm bench serve \ --backend $backend \ --tokenizer /tokenizer_cache \ --model $model \ @@ -265,13 +267,13 @@ run_serving_tests() { $client_args" else - + echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name." exit 1 fi - + echo "Running test case $test_name with qps $qps" echo "Client command: $client_command" @@ -302,7 +304,7 @@ run_serving_tests() { } run_genai_perf_tests() { - # run genai-perf tests + # run genai-perf tests # $1: a json file specifying genai-perf test cases local genai_perf_test_file @@ -311,14 +313,14 @@ run_genai_perf_tests() { # Iterate over genai-perf tests jq -c '.[]' "$genai_perf_test_file" | while read -r params; do # get the test name, and append the GPU type back to it. - test_name=$(echo "$params" | jq -r '.test_name') - + test_name=$(echo "$params" | jq -r '.test_name') + # if TEST_SELECTOR is set, only run the test cases that match the selector if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then echo "Skip test case $test_name." continue fi - + # prepend the current serving engine to the test name test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name} @@ -369,10 +371,10 @@ run_genai_perf_tests() { qps=$num_prompts echo "now qps is $qps" fi - + new_test_name=$test_name"_qps_"$qps backend=$CURRENT_LLM_SERVING_ENGINE - + if [[ "$backend" == *"vllm"* ]]; then backend="vllm" fi @@ -413,7 +415,7 @@ prepare_dataset() { do cat sonnet.txt >> sonnet_4x.txt done - + } main() { diff --git a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh index f05040618981c..2c57666a81aa3 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh @@ -33,7 +33,7 @@ check_gpus() { check_cpus() { # check the number of CPUs and NUMA Node and GPU type. - declare -g numa_count=$(python3 -c "from numa import info;numa_size = info.get_num_configured_nodes(); print(numa_size)") + declare -g numa_count=$(lscpu | grep "NUMA node(s):" | awk '{print $3}') if [[ $numa_count -gt 0 ]]; then echo "NUMA found." 
echo $numa_count @@ -126,7 +126,8 @@ kill_gpu_processes() { ps -aux lsof -t -i:8000 | xargs -r kill -9 pgrep python3 | xargs -r kill -9 - + # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445 + pgrep VLLM | xargs -r kill -9 # wait until GPU memory usage smaller than 1GB if command -v nvidia-smi; then @@ -164,7 +165,7 @@ upload_to_buildkite() { } run_latency_tests() { - # run latency tests using `benchmark_latency.py` + # run latency tests using `vllm bench latency` command # $1: a json file specifying latency test cases local latency_test_file @@ -205,7 +206,7 @@ run_latency_tests() { fi fi - latency_command=" $latency_envs python3 benchmark_latency.py \ + latency_command=" $latency_envs vllm bench latency \ --output-json $RESULTS_FOLDER/${test_name}.json \ $latency_args" @@ -231,7 +232,7 @@ run_latency_tests() { } run_throughput_tests() { - # run throughput tests using `benchmark_throughput.py` + # run throughput tests using `vllm bench throughput` # $1: a json file specifying throughput test cases local throughput_test_file @@ -272,7 +273,7 @@ run_throughput_tests() { fi fi - throughput_command=" $throughput_envs python3 benchmark_throughput.py \ + throughput_command=" $throughput_envs vllm bench throughput \ --output-json $RESULTS_FOLDER/${test_name}.json \ $throughput_args" @@ -297,7 +298,7 @@ run_throughput_tests() { } run_serving_tests() { - # run serving tests using `benchmark_serving.py` + # run serving tests using `vllm bench serve` command # $1: a json file specifying serving test cases local serving_test_file @@ -393,7 +394,7 @@ run_serving_tests() { # pass the tensor parallel size to the client so that it can be displayed # on the benchmark dashboard - client_command="python3 benchmark_serving.py \ + client_command="vllm bench serve \ --save-result \ --result-dir $RESULTS_FOLDER \ --result-filename ${new_test_name}.json \ @@ -447,7 +448,7 @@ main() { (which jq) || (apt-get update && apt-get -y install jq) (which lsof) || (apt-get update && apt-get install -y lsof) - # get the current IP address, required by benchmark_serving.py + # get the current IP address, required by `vllm bench serve` command export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') # turn of the reporting of the status of each request, to clean up the terminal output export VLLM_LOGGING_LEVEL="WARNING" diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json new file mode 100644 index 0000000000000..a144b4420fbf1 --- /dev/null +++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json @@ -0,0 +1,209 @@ +[ + { + "test_name": "serving_llama8B_tp1_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "disable_log_requests": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + 
"max_concurrency": 60, + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_tp2_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 2, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "disable_log_requests": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "max_concurrency": 60, + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_tp4_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 4, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "disable_log_requests": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "max_concurrency": 60, + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_tp1_random_128_128", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "disable_log_requests": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "max_concurrency": 1000, + "num_prompts": 1000 + } + }, + { + "test_name": "serving_llama8B_tp2_random_128_128", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 2, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "disable_log_requests": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + 
"max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "max_concurrency": 1000, + "num_prompts": 1000 + } + }, + { + "test_name": "serving_llama8B_tp4_random_128_128", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 4, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "disable_log_requests": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "max_concurrency": 1000, + "num_prompts": 1000 + } + } +] diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json new file mode 100644 index 0000000000000..e6e69b63b74df --- /dev/null +++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json @@ -0,0 +1,211 @@ +[ + { + "test_name": "serving_llama8B_pp1_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "pipeline_parallel_size": 1, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "disable_log_requests": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "max_concurrency": 60, + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_pp3_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "pipeline_parallel_size": 3, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "disable_log_requests": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "max_concurrency": 60, + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_tp2pp6_sharegpt", + "qps_list": [1, 4, 16, "inf"], + 
"server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 2, + "pipeline_parallel_size": 3, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "disable_log_requests": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "max_concurrency": 60, + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_pp1_random_128_128", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "pipeline_parallel_size": 1, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "disable_log_requests": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "max_concurrency": 1000, + "num_prompts": 1000 + } + }, + { + "test_name": "serving_llama8B_pp3_random_128_128", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL:": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "pipeline_parallel_size": 3, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "disable_log_requests": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "max_concurrency": 1000, + "num_prompts": 1000 + } + }, + { + "test_name": "serving_llama8B_tp2pp3_random_128_128", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 2, + "pipeline_parallel_size": 3, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "disable_log_requests": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + 
"load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "max_concurrency": 1000, + "num_prompts": 1000 + } + } +] diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json index 22f71c993ff33..ce1f924de387f 100644 --- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json +++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json @@ -6,6 +6,7 @@ "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { @@ -18,6 +19,8 @@ "disable_log_stats": "", "disable_log_requests": "", "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, "load_format": "dummy" }, "client_parameters": { @@ -36,6 +39,7 @@ "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { @@ -48,6 +52,8 @@ "disable_log_stats": "", "disable_log_requests": "", "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, "load_format": "dummy" }, "client_parameters": { @@ -66,6 +72,7 @@ "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { @@ -78,6 +85,8 @@ "disable_log_stats": "", "disable_log_requests": "", "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, "load_format": "dummy" }, "client_parameters": { @@ -96,6 +105,7 @@ "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { @@ -109,6 +119,8 @@ "disable_log_stats": "", "disable_log_requests": "", "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, "load_format": "dummy" }, "client_parameters": { @@ -129,6 +141,7 @@ "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { @@ -142,6 +155,8 @@ "disable_log_stats": "", "disable_log_requests": "", "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, "load_format": "dummy" }, "client_parameters": { diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh index 90cc9c8446223..57a7bc4e5f5df 100644 --- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh @@ -13,9 +13,9 @@ NUMA_NODE=${NUMA_NODE:-1} export CMAKE_BUILD_PARALLEL_LEVEL=32 # Setup cleanup -remove_docker_container() { - set -e; - docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true; +remove_docker_container() { + set -e; + docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true; } trap remove_docker_container EXIT remove_docker_container @@ -69,7 +69,7 @@ function cpu_tests() { docker exec cpu-test-"$NUMA_NODE" bash -c " set -e pytest -s -v \ - tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]" + 
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]" # Note: disable it until supports V1 # Run AWQ test @@ -78,23 +78,23 @@ function cpu_tests() { # VLLM_USE_V1=0 pytest -s -v \ # tests/quantization/test_ipex_quant.py" - # online serving - docker exec cpu-test-"$NUMA_NODE" bash -c ' - set -e - VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 & - timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1 - python3 benchmarks/benchmark_serving.py \ - --backend vllm \ - --dataset-name random \ - --model meta-llama/Llama-3.2-3B-Instruct \ - --num-prompts 20 \ - --endpoint /v1/completions' - # Run multi-lora tests docker exec cpu-test-"$NUMA_NODE" bash -c " set -e pytest -s -v \ tests/lora/test_qwen2vl.py" + + # online serving + docker exec cpu-test-"$NUMA_NODE" bash -c ' + set -e + VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 & + timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1 + vllm bench serve \ + --backend vllm \ + --dataset-name random \ + --model meta-llama/Llama-3.2-3B-Instruct \ + --num-prompts 20 \ + --endpoint /v1/completions' } # All of CPU tests are expected to be finished less than 40 mins. diff --git a/.buildkite/scripts/hardware_ci/run-gh200-test.sh b/.buildkite/scripts/hardware_ci/run-gh200-test.sh index 8c64e14606d3b..f69e4b06680f5 100644 --- a/.buildkite/scripts/hardware_ci/run-gh200-test.sh +++ b/.buildkite/scripts/hardware_ci/run-gh200-test.sh @@ -16,8 +16,7 @@ DOCKER_BUILDKIT=1 docker build . \ --build-arg max_jobs=66 \ --build-arg nvcc_threads=2 \ --build-arg RUN_WHEEL_CHECK=false \ - --build-arg torch_cuda_arch_list="9.0+PTX" \ - --build-arg vllm_fa_cmake_gpu_arches="90-real" + --build-arg torch_cuda_arch_list="9.0+PTX" # Setup cleanup remove_docker_container() { docker rm -f gh200-test || true; } diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh new file mode 100755 index 0000000000000..d998c1f73b514 --- /dev/null +++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh @@ -0,0 +1,166 @@ +#!/bin/bash + +set -xu + + +remove_docker_container() { + docker rm -f tpu-test || true; + docker rm -f vllm-tpu || true; +} + +trap remove_docker_container EXIT + +# Remove the container that might not be cleaned up in the previous run. +remove_docker_container + +# Build the docker image. +docker build -f docker/Dockerfile.tpu -t vllm-tpu . + +# Set up cleanup. +cleanup_docker() { + # Get Docker's root directory + docker_root=$(docker info -f '{{.DockerRootDir}}') + if [ -z "$docker_root" ]; then + echo "Failed to determine Docker root directory." + exit 1 + fi + echo "Docker root directory: $docker_root" + # Check disk usage of the filesystem where Docker's root directory is located + disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//') + # Define the threshold + threshold=70 + if [ "$disk_usage" -gt "$threshold" ]; then + echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..." + # Remove dangling images (those that are not tagged and not used by any container) + docker image prune -f + # Remove unused volumes / force the system prune for old images as well. 
+ docker volume prune -f && docker system prune --force --filter "until=72h" --all + echo "Docker images and volumes cleanup completed." + else + echo "Disk usage is below $threshold%. No cleanup needed." + fi +} +cleanup_docker + +# For HF_TOKEN. +source /etc/environment + +docker run --privileged --net host --shm-size=16G -it \ + -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \ + vllm-tpu /bin/bash -c ' +set -e # Exit immediately if a command exits with a non-zero status. +set -u # Treat unset variables as an error. + +echo "--- Starting script inside Docker container ---" + +# Create results directory +RESULTS_DIR=$(mktemp -d) +# If mktemp fails, set -e will cause the script to exit. +echo "Results will be stored in: $RESULTS_DIR" + +# Install dependencies +echo "--- Installing Python dependencies ---" +python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \ + && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \ + && python3 -m pip install --progress-bar off lm_eval[api]==0.4.4 \ + && python3 -m pip install --progress-bar off hf-transfer +echo "--- Python dependencies installed ---" +export VLLM_USE_V1=1 +export VLLM_XLA_CHECK_RECOMPILATION=1 +export VLLM_XLA_CACHE_PATH= +echo "Using VLLM V1" + +echo "--- Hardware Information ---" +# tpu-info +echo "--- Starting Tests ---" +set +e +overall_script_exit_code=0 + +# --- Test Definitions --- +# If a test fails, this function will print logs and will not cause the main script to exit. +run_test() { + local test_num=$1 + local test_name=$2 + local test_command=$3 + local log_file="$RESULTS_DIR/test_${test_num}.log" + local actual_exit_code + + echo "--- TEST_$test_num: Running $test_name ---" + + # Execute the test command. + eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2) + actual_exit_code=$? + + echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" # This goes to main log + echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file" # Also to per-test log + + if [ "$actual_exit_code" -ne 0 ]; then + echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2 + echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2 + if [ -f "$log_file" ]; then + cat "$log_file" >&2 + else + echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2 + fi + echo "--- End of log for TEST_$test_num ($test_name) ---" >&2 + return "$actual_exit_code" # Return the failure code + else + echo "TEST_$test_num ($test_name) PASSED." + return 0 # Return success + fi +} + +# Helper function to call run_test and update the overall script exit code +run_and_track_test() { + local test_num_arg="$1" + local test_name_arg="$2" + local test_command_arg="$3" + + # Run the test + run_test "$test_num_arg" "$test_name_arg" "$test_command_arg" + local test_specific_exit_code=$? + + # If the test failed, set the overall script exit code to 1 + if [ "$test_specific_exit_code" -ne 0 ]; then + # No need for extra echo here, run_test already logged the failure. 
+ overall_script_exit_code=1 + fi +} + +# --- Actual Test Execution --- +run_and_track_test 1 "test_struct_output_generate.py" \ + "HF_HUB_DISABLE_XET=1 python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\"" +run_and_track_test 2 "test_moe_pallas.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py" +run_and_track_test 3 "test_lora.py" \ + "VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py" +run_and_track_test 4 "test_tpu_qkv_linear.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py" +run_and_track_test 5 "test_spmd_model_weight_loading.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py" +run_and_track_test 6 "test_kv_cache_update_kernel.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py" + +# After all tests have been attempted, exit with the overall status. +if [ "$overall_script_exit_code" -ne 0 ]; then + echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---" +else + echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---" +fi +exit "$overall_script_exit_code" +' # IMPORTANT: This is the closing single quote for the bash -c "..." command. Ensure it is present and correct. + +# Capture the exit code of the docker run command +DOCKER_RUN_EXIT_CODE=$? + +# The trap will run for cleanup. +# Exit the main script with the Docker run command's exit code. +if [ "$DOCKER_RUN_EXIT_CODE" -ne 0 ]; then + echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE." + exit "$DOCKER_RUN_EXIT_CODE" +else + echo "Docker run command completed successfully." + exit 0 +fi +# TODO: This test fails because it uses RANDOM_SEED sampling +# pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \ diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh index 5514d7770cff8..e565d4b246945 100755 --- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh +++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh @@ -150,18 +150,6 @@ run_and_track_test 9 "test_multimodal.py" \ "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py" run_and_track_test 10 "test_pallas.py" \ "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py" -run_and_track_test 11 "test_struct_output_generate.py" \ - "HF_HUB_DISABLE_XET=1 python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\"" -run_and_track_test 12 "test_moe_pallas.py" \ - "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py" -run_and_track_test 13 "test_lora.py" \ - "VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py" -run_and_track_test 14 "test_tpu_qkv_linear.py" \ - "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py" -run_and_track_test 15 "test_spmd_model_weight_loading.py" \ - "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py" -run_and_track_test 16 "test_kv_cache_update_kernel.py" \ - "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py" # After all tests have been attempted, exit with the overall status. 
if [ "$overall_script_exit_code" -ne 0 ]; then diff --git a/.buildkite/scripts/run-benchmarks.sh b/.buildkite/scripts/run-benchmarks.sh index 195a8063fd743..72812218cb668 100644 --- a/.buildkite/scripts/run-benchmarks.sh +++ b/.buildkite/scripts/run-benchmarks.sh @@ -11,10 +11,10 @@ cd "$(dirname "${BASH_SOURCE[0]}")/../.." (which wget && which curl) || (apt-get update && apt-get install -y wget curl) # run python-based benchmarks and upload the result to buildkite -python3 benchmarks/benchmark_latency.py --output-json latency_results.json 2>&1 | tee benchmark_latency.txt +vllm bench latency --output-json latency_results.json 2>&1 | tee benchmark_latency.txt bench_latency_exit_code=$? -python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt +vllm bench throughput --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt bench_throughput_exit_code=$? # run server-based benchmarks and upload the result to buildkite @@ -24,7 +24,7 @@ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/r # wait for server to start, timeout after 600 seconds timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1 -python3 benchmarks/benchmark_serving.py \ +vllm bench serve \ --backend vllm \ --dataset-name sharegpt \ --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \ diff --git a/.buildkite/scripts/tpu/run_bm.sh b/.buildkite/scripts/tpu/run_bm.sh index 877669cd956ac..beecaf7a740ae 100755 --- a/.buildkite/scripts/tpu/run_bm.sh +++ b/.buildkite/scripts/tpu/run_bm.sh @@ -77,7 +77,7 @@ done echo "run benchmark test..." echo "logging to $BM_LOG" echo -python benchmarks/benchmark_serving.py \ +vllm bench serve \ --backend vllm \ --model $MODEL \ --dataset-name sonnet \ diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 948ce9e8667f5..2bf0b6fd9a169 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -128,11 +128,10 @@ steps: - tests/entrypoints/offline_mode commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py --ignore=entrypoints/llm/test_collective_rpc.py + - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_collective_rpc.py - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process - - VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests - label: Entrypoints Test (API Server) # 40min @@ -403,17 +402,18 @@ steps: - vllm/model_executor/layers/quantization - tests/kernels/quantization commands: - - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT parallelism: 2 -- label: 
Kernels MoE Test +- label: Kernels MoE Test %N mirror_hardwares: [amdexperimental] source_file_dependencies: - csrc/moe/ - tests/kernels/moe - vllm/model_executor/layers/fused_moe/ commands: - - pytest -v -s kernels/moe + - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + parallelism: 2 - label: Kernels Mamba Test mirror_hardwares: [amdexperimental, amdproduction] @@ -643,6 +643,17 @@ steps: - python3 examples/offline_inference/audio_language.py --model-type whisper - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl +- label: Blackwell Test + working_dir: "/vllm-workspace/" + gpu: b200 + optional: true + source_file_dependencies: + - csrc/ + - vllm/ + commands: + - nvidia-smi + - python3 examples/offline_inference/basic/chat.py + ##### 1 GPU test ##### ##### multi gpus test ##### diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 2441055371663..5bc944296763d 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -10,7 +10,6 @@ /vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill /vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth -/vllm/model_executor/guided_decoding @mgoin @russellb @aarnphm /vllm/multimodal @DarkLight1337 @ywang96 /vllm/vllm_flash_attn @LucasWilkinson /vllm/lora @jeejeelee @@ -35,9 +34,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson /tests/distributed/test_pipeline_parallel.py @youkaichao /tests/distributed/test_same_node.py @youkaichao /tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm -/tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb @aarnphm /tests/kernels @tlrmchlsmth @WoosukKwon -/tests/model_executor/test_guided_processors.py @mgoin @russellb /tests/models @DarkLight1337 @ywang96 /tests/multi_step @alexm-redhat @comaniac /tests/multimodal @DarkLight1337 @ywang96 @@ -64,3 +61,15 @@ mkdocs.yaml @hmellor /vllm/v1/worker/^xpu @jikunshang /vllm/platforms/xpu.py @jikunshang /docker/Dockerfile.xpu @jikunshang + +# Qwen-specific files +/vllm/attention/backends/dual_chunk_flash_attn.py @sighingnow +/vllm/model_executor/models/qwen* @sighingnow + +# Mistral-specific files +/vllm/model_executor/models/mistral*.py @patrickvonplaten +/vllm/model_executor/models/mixtral*.py @patrickvonplaten +/vllm/model_executor/models/voxtral*.py @patrickvonplaten +/vllm/model_executor/models/pixtral*.py @patrickvonplaten +/vllm/transformers_utils/configs/mistral.py @patrickvonplaten +/vllm/transformers_utils/tokenizers/mistral.py @patrickvonplaten diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 017ec7ca82da7..d4aceab4472fa 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,4 +1,5 @@ -## Essential Elements of an Effective PR Description Checklist +# Essential Elements of an Effective PR Description Checklist + - [ ] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)". - [ ] The test plan, such as providing test command. 
- [ ] The test results, such as pasting the results comparison before and after, or e2e results @@ -14,5 +15,4 @@ PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS ABOVE HAVE B ## (Optional) Documentation Update - **BEFORE SUBMITTING, PLEASE READ ** (anything written below this line will be removed by GitHub Actions) diff --git a/.github/mergify.yml b/.github/mergify.yml index 5c878ac02069f..d8ae509e0ac30 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -149,9 +149,6 @@ pull_request_rules: - files=examples/offline_inference/structured_outputs.py - files=examples/online_serving/openai_chat_completion_structured_outputs.py - files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py - - files~=^vllm/model_executor/guided_decoding/ - - files=tests/model_executor/test_guided_processors.py - - files=tests/entrypoints/llm/test_guided_generate.py - files~=^tests/v1/structured_output/ - files=tests/v1/entrypoints/llm/test_guided_generate.py - files~=^vllm/v1/structured_output/ diff --git a/.github/workflows/lint-and-deploy.yaml b/.github/workflows/lint-and-deploy.yaml index d5736c0aee208..2b1086b7faf43 100644 --- a/.github/workflows/lint-and-deploy.yaml +++ b/.github/workflows/lint-and-deploy.yaml @@ -2,12 +2,16 @@ name: Lint and Deploy Charts on: pull_request +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + permissions: contents: read jobs: lint-and-deploy: - runs-on: ubuntu-24.04-arm + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 diff --git a/.github/workflows/matchers/markdownlint.json b/.github/workflows/matchers/markdownlint.json new file mode 100644 index 0000000000000..fe094a9badb25 --- /dev/null +++ b/.github/workflows/matchers/markdownlint.json @@ -0,0 +1,17 @@ +{ + "problemMatcher": [ + { + "owner": "markdownlint", + "pattern": [ + { + "regexp": "^([^:]*):(\\d+):?(\\d+)?\\s([\\w-\\/]*)\\s(.*)$", + "file": 1, + "line": 2, + "column": 3, + "code": 4, + "message": 5 + } + ] + } + ] +} \ No newline at end of file diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 8e694d18134ef..195579f206a2f 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -5,6 +5,10 @@ on: push: branches: [main] +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: ${{ github.event_name == 'pull_request' }} + permissions: contents: read @@ -17,6 +21,7 @@ jobs: with: python-version: "3.12" - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json" + - run: echo "::add-matcher::.github/workflows/matchers/markdownlint.json" - run: echo "::add-matcher::.github/workflows/matchers/mypy.json" - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1 with: diff --git a/.github/workflows/scripts/build.sh b/.github/workflows/scripts/build.sh index 0f010832b465d..c69ebbb42da5a 100644 --- a/.github/workflows/scripts/build.sh +++ b/.github/workflows/scripts/build.sh @@ -15,7 +15,6 @@ $python_executable -m pip install -r requirements/build.txt -r requirements/cuda export MAX_JOBS=1 # Make sure release wheels are built for the following architectures export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX" -export VLLM_FA_CMAKE_GPU_ARCHES="80-real;90-real" bash tools/check_repo.sh diff --git a/.markdownlint.yaml b/.markdownlint.yaml new file mode 100644 index 0000000000000..c86fed9555d62 --- /dev/null +++ 
b/.markdownlint.yaml @@ -0,0 +1,13 @@ +MD007: + indent: 4 +MD013: false +MD024: + siblings_only: true +MD033: false +MD042: false +MD045: false +MD046: false +MD051: false +MD052: false +MD053: false +MD059: false diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5197820fb4020..612b290e88d46 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -35,12 +35,12 @@ repos: exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*' types_or: [c++, cuda] args: [--style=file, --verbose] -- repo: https://github.com/jackdewinter/pymarkdown - rev: v0.9.29 +- repo: https://github.com/igorshubovych/markdownlint-cli + rev: v0.45.0 hooks: - - id: pymarkdown + - id: markdownlint exclude: '.*\.inc\.md' - args: [fix] + stages: [manual] # Only run in CI - repo: https://github.com/rhysd/actionlint rev: v1.7.7 hooks: diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 98c3be25f7e76..4329750090683 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -7,6 +7,9 @@ build: os: ubuntu-22.04 tools: python: "3.12" + jobs: + post_checkout: + - git fetch --unshallow || true mkdocs: configuration: mkdocs.yaml diff --git a/README.md b/README.md index dc2f0afbe3538..5348405b72d2c 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,4 @@ +

@@ -16,6 +17,7 @@ Easy, fast, and cheap LLM serving for everyone --- *Latest News* 🔥 + - [2025/05] We hosted [NYC vLLM Meetup](https://lu.ma/c1rqyf1f)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing). - [2025/05] vLLM is now a hosted project under PyTorch Foundation! Please find the announcement [here](https://pytorch.org/blog/pytorch-foundation-welcomes-vllm/). - [2025/04] We hosted [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing). @@ -46,6 +48,7 @@ Easy, fast, and cheap LLM serving for everyone --- + ## About vLLM is a fast and easy-to-use library for LLM inference and serving. @@ -75,6 +78,7 @@ vLLM is flexible and easy to use with: - Multi-LoRA support vLLM seamlessly supports most popular open-source models on HuggingFace, including: + - Transformer-like LLMs (e.g., Llama) - Mixture-of-Expert LLMs (e.g., Mixtral, Deepseek-V2 and V3) - Embedding Models (e.g., E5-Mistral) @@ -91,6 +95,7 @@ pip install vllm ``` Visit our [documentation](https://docs.vllm.ai/en/latest/) to learn more. + - [Installation](https://docs.vllm.ai/en/latest/getting_started/installation.html) - [Quickstart](https://docs.vllm.ai/en/latest/getting_started/quickstart.html) - [List of Supported Models](https://docs.vllm.ai/en/latest/models/supported_models.html) @@ -107,6 +112,7 @@ vLLM is a community project. Our compute resources for development and testing a Cash Donations: + - a16z - Dropbox - Sequoia Capital @@ -114,6 +120,7 @@ Cash Donations: - ZhenFund Compute Resources: + - AMD - Anyscale - AWS diff --git a/RELEASE.md b/RELEASE.md index 9352e7ef706c6..db0d51afc7be1 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -60,9 +60,10 @@ Please note: **No feature work allowed for cherry picks**. All PRs that are cons Before each release, we perform end-to-end performance validation to ensure no regressions are introduced. This validation uses the [vllm-benchmark workflow](https://github.com/pytorch/pytorch-integration-testing/actions/workflows/vllm-benchmark.yml) on PyTorch CI. 
**Current Coverage:** + * Models: Llama3, Llama4, and Mixtral * Hardware: NVIDIA H100 and AMD MI300x -* *Note: Coverage may change based on new model releases and hardware availability* +* _Note: Coverage may change based on new model releases and hardware availability_ **Performance Validation Process:** @@ -71,11 +72,13 @@ Request write access to the [pytorch/pytorch-integration-testing](https://github **Step 2: Review Benchmark Setup** Familiarize yourself with the benchmark configurations: + * [CUDA setup](https://github.com/pytorch/pytorch-integration-testing/tree/main/vllm-benchmarks/benchmarks/cuda) * [ROCm setup](https://github.com/pytorch/pytorch-integration-testing/tree/main/vllm-benchmarks/benchmarks/rocm) **Step 3: Run the Benchmark** Navigate to the [vllm-benchmark workflow](https://github.com/pytorch/pytorch-integration-testing/actions/workflows/vllm-benchmark.yml) and configure: + * **vLLM branch**: Set to the release branch (e.g., `releases/v0.9.2`) * **vLLM commit**: Set to the RC commit hash diff --git a/benchmarks/README.md b/benchmarks/README.md index fb8690d42db98..644517235b122 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -4,7 +4,7 @@ This README guides you through running benchmark tests with the extensive datasets supported on vLLM. It’s a living document, updated as new features and datasets become available. -**Dataset Overview** +## Dataset Overview @@ -81,9 +81,10 @@ become available. **Note**: HuggingFace dataset's `dataset-name` should be set to `hf` ---- +## 🚀 Example - Online Benchmark +
-🚀 Example - Online Benchmark +Show more
@@ -98,7 +99,7 @@ Then run the benchmarking script ```bash # download dataset # wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json -python3 vllm/benchmarks/benchmark_serving.py \ +vllm bench serve \ --backend vllm \ --model NousResearch/Hermes-3-Llama-3.1-8B \ --endpoint /v1/completions \ @@ -109,39 +110,39 @@ python3 vllm/benchmarks/benchmark_serving.py \ If successful, you will see the following output -``` +```text ============ Serving Benchmark Result ============ -Successful requests: 10 -Benchmark duration (s): 5.78 -Total input tokens: 1369 -Total generated tokens: 2212 -Request throughput (req/s): 1.73 -Output token throughput (tok/s): 382.89 -Total Token throughput (tok/s): 619.85 +Successful requests: 10 +Benchmark duration (s): 5.78 +Total input tokens: 1369 +Total generated tokens: 2212 +Request throughput (req/s): 1.73 +Output token throughput (tok/s): 382.89 +Total Token throughput (tok/s): 619.85 ---------------Time to First Token---------------- -Mean TTFT (ms): 71.54 -Median TTFT (ms): 73.88 -P99 TTFT (ms): 79.49 +Mean TTFT (ms): 71.54 +Median TTFT (ms): 73.88 +P99 TTFT (ms): 79.49 -----Time per Output Token (excl. 1st token)------ -Mean TPOT (ms): 7.91 -Median TPOT (ms): 7.96 -P99 TPOT (ms): 8.03 +Mean TPOT (ms): 7.91 +Median TPOT (ms): 7.96 +P99 TPOT (ms): 8.03 ---------------Inter-token Latency---------------- -Mean ITL (ms): 7.74 -Median ITL (ms): 7.70 -P99 ITL (ms): 8.39 +Mean ITL (ms): 7.74 +Median ITL (ms): 7.70 +P99 ITL (ms): 8.39 ================================================== ``` -**Custom Dataset** +### Custom Dataset If the dataset you want to benchmark is not supported yet in vLLM, even then you can benchmark on it using `CustomDataset`. Your data needs to be in `.jsonl` format and needs to have "prompt" field per entry, e.g., data.jsonl -``` +```json {"prompt": "What is the capital of India?"} {"prompt": "What is the capital of Iran?"} {"prompt": "What is the capital of China?"} -``` +``` ```bash # start server @@ -150,7 +151,7 @@ VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct --disable-log-requests ```bash # run benchmarking script -python3 benchmarks/benchmark_serving.py --port 9001 --save-result --save-detailed \ +vllm bench serve --port 9001 --save-result --save-detailed \ --backend vllm \ --model meta-llama/Llama-3.1-8B-Instruct \ --endpoint /v1/completions \ @@ -166,7 +167,7 @@ python3 benchmarks/benchmark_serving.py --port 9001 --save-result --save-detaile You can skip applying chat template if your data already has it by using `--custom-skip-chat-template`. 
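As a point of reference, here is a minimal sketch of the same custom-dataset run with `--custom-skip-chat-template` added. It assumes the server, model, and `data.jsonl` layout from the example above; the exact flag combination is illustrative and has not been re-verified here:

```bash
# Hypothetical variation of the custom-dataset run above: the prompts in
# data.jsonl already include a chat template, so skip re-applying it.
vllm bench serve --port 9001 --save-result --save-detailed \
  --backend vllm \
  --model meta-llama/Llama-3.1-8B-Instruct \
  --endpoint /v1/completions \
  --dataset-name custom \
  --dataset-path <path to your data.jsonl> \
  --custom-skip-chat-template \
  --num-prompts 10
```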
-**VisionArena Benchmark for Vision Language Models** +### VisionArena Benchmark for Vision Language Models ```bash # need a model with vision capability here @@ -174,7 +175,7 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests ``` ```bash -python3 vllm/benchmarks/benchmark_serving.py \ +vllm bench serve \ --backend openai-chat \ --model Qwen/Qwen2-VL-7B-Instruct \ --endpoint /v1/chat/completions \ @@ -184,7 +185,7 @@ python3 vllm/benchmarks/benchmark_serving.py \ --num-prompts 1000 ``` -**InstructCoder Benchmark with Speculative Decoding** +### InstructCoder Benchmark with Speculative Decoding ``` bash VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \ @@ -194,23 +195,23 @@ VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \ ``` ``` bash -python3 benchmarks/benchmark_serving.py \ +vllm bench serve \ --model meta-llama/Meta-Llama-3-8B-Instruct \ --dataset-name hf \ --dataset-path likaixin/InstructCoder \ --num-prompts 2048 ``` -**Other HuggingFaceDataset Examples** +### Other HuggingFaceDataset Examples ```bash vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests ``` -**`lmms-lab/LLaVA-OneVision-Data`** +`lmms-lab/LLaVA-OneVision-Data`: ```bash -python3 vllm/benchmarks/benchmark_serving.py \ +vllm bench serve \ --backend openai-chat \ --model Qwen/Qwen2-VL-7B-Instruct \ --endpoint /v1/chat/completions \ @@ -221,10 +222,10 @@ python3 vllm/benchmarks/benchmark_serving.py \ --num-prompts 10 ``` -**`Aeala/ShareGPT_Vicuna_unfiltered`** +`Aeala/ShareGPT_Vicuna_unfiltered`: ```bash -python3 vllm/benchmarks/benchmark_serving.py \ +vllm bench serve \ --backend openai-chat \ --model Qwen/Qwen2-VL-7B-Instruct \ --endpoint /v1/chat/completions \ @@ -234,10 +235,10 @@ python3 vllm/benchmarks/benchmark_serving.py \ --num-prompts 10 ``` -**`AI-MO/aimo-validation-aime`** +`AI-MO/aimo-validation-aime`: ``` bash -python3 vllm/benchmarks/benchmark_serving.py \ +vllm bench serve \ --model Qwen/QwQ-32B \ --dataset-name hf \ --dataset-path AI-MO/aimo-validation-aime \ @@ -245,23 +246,23 @@ python3 vllm/benchmarks/benchmark_serving.py \ --seed 42 ``` -**`philschmid/mt-bench`** +`philschmid/mt-bench`: ``` bash -python3 vllm/benchmarks/benchmark_serving.py \ +vllm bench serve \ --model Qwen/QwQ-32B \ --dataset-name hf \ --dataset-path philschmid/mt-bench \ --num-prompts 80 ``` -**Running With Sampling Parameters** +### Running With Sampling Parameters When using OpenAI-compatible backends such as `vllm`, optional sampling parameters can be specified. Example client command: ```bash -python3 vllm/benchmarks/benchmark_serving.py \ +vllm bench serve \ --backend vllm \ --model NousResearch/Hermes-3-Llama-3.1-8B \ --endpoint /v1/completions \ @@ -273,30 +274,34 @@ python3 vllm/benchmarks/benchmark_serving.py \ --num-prompts 10 ``` -**Running With Ramp-Up Request Rate** +### Running With Ramp-Up Request Rate The benchmark tool also supports ramping up the request rate over the duration of the benchmark run. This can be useful for stress testing the server or finding the maximum throughput that it can handle, given some latency budget. Two ramp-up strategies are supported: + - `linear`: Increases the request rate linearly from a start value to an end value. - `exponential`: Increases the request rate exponentially. The following arguments can be used to control the ramp-up: + - `--ramp-up-strategy`: The ramp-up strategy to use (`linear` or `exponential`). - `--ramp-up-start-rps`: The request rate at the beginning of the benchmark. 
- `--ramp-up-end-rps`: The request rate at the end of the benchmark.
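For illustration, a minimal sketch of a ramp-up run that combines these arguments with the online-benchmark command shown earlier; it assumes the ShareGPT dataset downloaded as in the first example, and the start/end rates and prompt count are arbitrary example values, not recommendations:

```bash
# Linearly ramp the request rate from 1 RPS to 20 RPS over the run
# (rates chosen purely for illustration).
vllm bench serve \
  --backend vllm \
  --model NousResearch/Hermes-3-Llama-3.1-8B \
  --endpoint /v1/completions \
  --dataset-name sharegpt \
  --dataset-path ShareGPT_V3_unfiltered_cleaned_split.json \
  --num-prompts 1000 \
  --ramp-up-strategy linear \
  --ramp-up-start-rps 1 \
  --ramp-up-end-rps 20
```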
+## 📈 Example - Offline Throughput Benchmark +
-📈 Example - Offline Throughput Benchmark +Show more
```bash -python3 vllm/benchmarks/benchmark_throughput.py \ +vllm bench throughput \ --model NousResearch/Hermes-3-Llama-3.1-8B \ --dataset-name sonnet \ --dataset-path vllm/benchmarks/sonnet.txt \ @@ -305,16 +310,16 @@ python3 vllm/benchmarks/benchmark_throughput.py \ If successful, you will see the following output -``` +```text Throughput: 7.15 requests/s, 4656.00 total tokens/s, 1072.15 output tokens/s Total num prompt tokens: 5014 Total num output tokens: 1500 ``` -**VisionArena Benchmark for Vision Language Models** +### VisionArena Benchmark for Vision Language Models -``` bash -python3 vllm/benchmarks/benchmark_throughput.py \ +```bash +vllm bench throughput \ --model Qwen/Qwen2-VL-7B-Instruct \ --backend vllm-chat \ --dataset-name hf \ @@ -325,18 +330,18 @@ python3 vllm/benchmarks/benchmark_throughput.py \ The `num prompt tokens` now includes image token counts -``` +```text Throughput: 2.55 requests/s, 4036.92 total tokens/s, 326.90 output tokens/s Total num prompt tokens: 14527 Total num output tokens: 1280 ``` -**InstructCoder Benchmark with Speculative Decoding** +### InstructCoder Benchmark with Speculative Decoding ``` bash VLLM_WORKER_MULTIPROC_METHOD=spawn \ VLLM_USE_V1=1 \ -python3 vllm/benchmarks/benchmark_throughput.py \ +vllm bench throughput \ --dataset-name=hf \ --dataset-path=likaixin/InstructCoder \ --model=meta-llama/Meta-Llama-3-8B-Instruct \ @@ -349,18 +354,18 @@ python3 vllm/benchmarks/benchmark_throughput.py \ "prompt_lookup_min": 2}' ``` -``` +```text Throughput: 104.77 requests/s, 23836.22 total tokens/s, 10477.10 output tokens/s Total num prompt tokens: 261136 Total num output tokens: 204800 ``` -**Other HuggingFaceDataset Examples** +### Other HuggingFaceDataset Examples -**`lmms-lab/LLaVA-OneVision-Data`** +`lmms-lab/LLaVA-OneVision-Data`: ```bash -python3 vllm/benchmarks/benchmark_throughput.py \ +vllm bench throughput \ --model Qwen/Qwen2-VL-7B-Instruct \ --backend vllm-chat \ --dataset-name hf \ @@ -370,10 +375,10 @@ python3 vllm/benchmarks/benchmark_throughput.py \ --num-prompts 10 ``` -**`Aeala/ShareGPT_Vicuna_unfiltered`** +`Aeala/ShareGPT_Vicuna_unfiltered`: ```bash -python3 vllm/benchmarks/benchmark_throughput.py \ +vllm bench throughput \ --model Qwen/Qwen2-VL-7B-Instruct \ --backend vllm-chat \ --dataset-name hf \ @@ -382,10 +387,10 @@ python3 vllm/benchmarks/benchmark_throughput.py \ --num-prompts 10 ``` -**`AI-MO/aimo-validation-aime`** +`AI-MO/aimo-validation-aime`: ```bash -python3 benchmarks/benchmark_throughput.py \ +vllm bench throughput \ --model Qwen/QwQ-32B \ --backend vllm \ --dataset-name hf \ @@ -394,12 +399,12 @@ python3 benchmarks/benchmark_throughput.py \ --num-prompts 10 ``` -**Benchmark with LoRA Adapters** +Benchmark with LoRA adapters: ``` bash # download dataset # wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json -python3 vllm/benchmarks/benchmark_throughput.py \ +vllm bench throughput \ --model meta-llama/Llama-2-7b-hf \ --backend vllm \ --dataset_path /ShareGPT_V3_unfiltered_cleaned_split.json \ @@ -413,20 +418,22 @@ python3 vllm/benchmarks/benchmark_throughput.py \
+## 🛠️ Example - Structured Output Benchmark +
-🛠️ Example - Structured Output Benchmark +Show more
Benchmark the performance of structured output generation (JSON, grammar, regex). -**Server Setup** +### Server Setup ```bash vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests ``` -**JSON Schema Benchmark** +### JSON Schema Benchmark ```bash python3 benchmarks/benchmark_serving_structured_output.py \ @@ -438,7 +445,7 @@ python3 benchmarks/benchmark_serving_structured_output.py \ --num-prompts 1000 ``` -**Grammar-based Generation Benchmark** +### Grammar-based Generation Benchmark ```bash python3 benchmarks/benchmark_serving_structured_output.py \ @@ -450,7 +457,7 @@ python3 benchmarks/benchmark_serving_structured_output.py \ --num-prompts 1000 ``` -**Regex-based Generation Benchmark** +### Regex-based Generation Benchmark ```bash python3 benchmarks/benchmark_serving_structured_output.py \ @@ -461,7 +468,7 @@ python3 benchmarks/benchmark_serving_structured_output.py \ --num-prompts 1000 ``` -**Choice-based Generation Benchmark** +### Choice-based Generation Benchmark ```bash python3 benchmarks/benchmark_serving_structured_output.py \ @@ -472,7 +479,7 @@ python3 benchmarks/benchmark_serving_structured_output.py \ --num-prompts 1000 ``` -**XGrammar Benchmark Dataset** +### XGrammar Benchmark Dataset ```bash python3 benchmarks/benchmark_serving_structured_output.py \ @@ -485,14 +492,16 @@ python3 benchmarks/benchmark_serving_structured_output.py \
+## 📚 Example - Long Document QA Benchmark +
-📚 Example - Long Document QA Benchmark +Show more
Benchmark the performance of long document question-answering with prefix caching. -**Basic Long Document QA Test** +### Basic Long Document QA Test ```bash python3 benchmarks/benchmark_long_document_qa_throughput.py \ @@ -504,7 +513,7 @@ python3 benchmarks/benchmark_long_document_qa_throughput.py \ --repeat-count 5 ``` -**Different Repeat Modes** +### Different Repeat Modes ```bash # Random mode (default) - shuffle prompts randomly @@ -537,14 +546,16 @@ python3 benchmarks/benchmark_long_document_qa_throughput.py \
+## 🗂️ Example - Prefix Caching Benchmark +
-🗂️ Example - Prefix Caching Benchmark +Show more
Benchmark the efficiency of automatic prefix caching. -**Fixed Prompt with Prefix Caching** +### Fixed Prompt with Prefix Caching ```bash python3 benchmarks/benchmark_prefix_caching.py \ @@ -555,7 +566,7 @@ python3 benchmarks/benchmark_prefix_caching.py \ --input-length-range 128:256 ``` -**ShareGPT Dataset with Prefix Caching** +### ShareGPT Dataset with Prefix Caching ```bash # download dataset @@ -572,14 +583,16 @@ python3 benchmarks/benchmark_prefix_caching.py \
+## ⚡ Example - Request Prioritization Benchmark +
-⚡ Example - Request Prioritization Benchmark +Show more
Benchmark the performance of request prioritization in vLLM. -**Basic Prioritization Test** +### Basic Prioritization Test ```bash python3 benchmarks/benchmark_prioritization.py \ @@ -590,7 +603,7 @@ python3 benchmarks/benchmark_prioritization.py \ --scheduling-policy priority ``` -**Multiple Sequences per Prompt** +### Multiple Sequences per Prompt ```bash python3 benchmarks/benchmark_prioritization.py \ diff --git a/benchmarks/auto_tune/README.md b/benchmarks/auto_tune/README.md index ae5962fe92542..9aad51df6e003 100644 --- a/benchmarks/auto_tune/README.md +++ b/benchmarks/auto_tune/README.md @@ -3,6 +3,7 @@ This script automates the process of finding the optimal server parameter combination (`max-num-seqs` and `max-num-batched-tokens`) to maximize throughput for a vLLM server. It also supports additional constraints such as E2E latency and prefix cache hit rate. ## Table of Contents + - [Prerequisites](#prerequisites) - [Configuration](#configuration) - [How to Run](#how-to-run) @@ -52,7 +53,7 @@ You must set the following variables at the top of the script before execution. 1. **Configure**: Edit the script and set the variables in the [Configuration](#configuration) section. 2. **Execute**: Run the script. Since the process can take a long time, it is highly recommended to use a terminal multiplexer like `tmux` or `screen` to prevent the script from stopping if your connection is lost. -``` +```bash cd bash auto_tune.sh ``` @@ -64,6 +65,7 @@ bash auto_tune.sh Here are a few examples of how to configure the script for different goals: ### 1. Maximize Throughput (No Latency Constraint) + - **Goal**: Find the best `max-num-seqs` and `max-num-batched-tokens` to get the highest possible throughput for 1800 input tokens and 20 output tokens. - **Configuration**: @@ -76,6 +78,7 @@ MAX_LATENCY_ALLOWED_MS=100000000000 # A very large number ``` #### 2. Maximize Throughput with a Latency Requirement + - **Goal**: Find the best server parameters when P99 end-to-end latency must be below 500ms. - **Configuration**: @@ -88,6 +91,7 @@ MAX_LATENCY_ALLOWED_MS=500 ``` #### 3. Maximize Throughput with Prefix Caching and Latency Requirements + - **Goal**: Find the best server parameters assuming a 60% prefix cache hit rate and a latency requirement of 500ms. - **Configuration**: @@ -105,11 +109,11 @@ After the script finishes, you will find the results in a new, timestamped direc - **Log Files**: The directory (`$BASE/auto-benchmark/YYYY_MM_DD_HH_MM/`) contains detailed logs for each run: - `vllm_log_...txt`: The log output from the vLLM server for each parameter combination. - - `bm_log_...txt`: The log output from the `benchmark_serving.py` script for each benchmark run. + - `bm_log_...txt`: The log output from the `vllm bench serve` command for each benchmark run. - **Final Result Summary**: A file named `result.txt` is created in the log directory. It contains a summary of each tested combination and concludes with the overall best parameters found. -``` +```text # Example result.txt content hash:a1b2c3d4... max_num_seqs: 128, max_num_batched_tokens: 2048, request_rate: 10.0, e2el: 450.5, throughput: 9.8, goodput: 9.8 diff --git a/benchmarks/auto_tune/auto_tune.sh b/benchmarks/auto_tune/auto_tune.sh index 8d3e1d4bee352..3cd8580e065dd 100644 --- a/benchmarks/auto_tune/auto_tune.sh +++ b/benchmarks/auto_tune/auto_tune.sh @@ -1,6 +1,6 @@ #!/bin/bash -# This script aims to tune the best server parameter combinations to maximize throughput for given requirement. 
+# This script aims to tune the best server parameter combinations to maximize throughput for given requirement. # See details in README (benchmarks/auto_tune/README.md). TAG=$(date +"%Y_%m_%d_%H_%M") @@ -56,7 +56,7 @@ start_server() { local max_num_batched_tokens=$3 local vllm_log=$4 local profile_dir=$5 - + pkill -f vllm VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir vllm serve $MODEL \ @@ -73,9 +73,9 @@ start_server() { # wait for 10 minutes... server_started=0 - for i in {1..60}; do + for i in {1..60}; do RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout) - STATUS_CODE=$(echo "$RESPONSE" | tail -n 1) + STATUS_CODE=$(echo "$RESPONSE" | tail -n 1) if [[ "$STATUS_CODE" -eq 200 ]]; then server_started=1 break @@ -98,10 +98,10 @@ update_best_profile() { selected_profile_file= if [[ "$SYSTEM" == "TPU" ]]; then selected_profile_file="${sorted_paths[$profile_index]}/*.xplane.pb" - fi + fi if [[ "$SYSTEM" == "GPU" ]]; then selected_profile_file="${sorted_paths[$profile_index]}" - fi + fi rm -f $PROFILE_PATH/* cp $selected_profile_file $PROFILE_PATH } @@ -129,14 +129,14 @@ run_benchmark() { echo "server started." fi echo - + echo "run benchmark test..." meet_latency_requirement=0 # get a basic qps by using request-rate inf bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt" prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 )) adjusted_input_len=$(( INPUT_LEN - prefix_len )) - python3 benchmarks/benchmark_serving.py \ + vllm bench serve \ --backend vllm \ --model $MODEL \ --dataset-name random \ @@ -169,7 +169,7 @@ adjusted_input_len=$(( INPUT_LEN - prefix_len )) curl -X POST http://0.0.0.0:8004/reset_prefix_cache sleep 5 bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt" - python3 benchmarks/benchmark_serving.py \ + vllm bench serve \ --backend vllm \ --model $MODEL \ --dataset-name random \ diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index 4d2ea126b24a5..d8b960edaa468 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -11,6 +11,7 @@ from typing import Any, Optional import numpy as np from tqdm import tqdm +from typing_extensions import deprecated import vllm.envs as envs from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json @@ -34,6 +35,10 @@ def save_to_pytorch_benchmark_format( write_to_json(pt_file, pt_records) +@deprecated( + "benchmark_latency.py is deprecated and will be removed in a " + "future version. 
Please use 'vllm bench latency' instead.", +) def main(args: argparse.Namespace): print(args) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index c597fb1068aba..3affa18ae3a4f 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -38,6 +38,7 @@ from typing import Any, Literal, Optional import numpy as np from tqdm.asyncio import tqdm from transformers import PreTrainedTokenizerBase +from typing_extensions import deprecated from backend_request_func import ( ASYNC_REQUEST_FUNCS, @@ -395,20 +396,6 @@ async def benchmark( tasks.append(asyncio.create_task(task)) outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks) - if profile: - print("Stopping profiler...") - profile_input = RequestFuncInput( - model=model_id, - prompt=test_prompt, - api_url=base_url + "/stop_profile", - prompt_len=test_prompt_len, - output_len=test_output_len, - logprobs=logprobs, - ) - profile_output = await request_func(request_func_input=profile_input) - if profile_output.success: - print("Profiler stopped") - if pbar is not None: pbar.close() @@ -426,6 +413,10 @@ async def benchmark( print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="=")) print("{:<40} {:<10}".format("Successful requests:", metrics.completed)) + if max_concurrency is not None: + print("{:<40} {:<10}".format("Maximum request concurrency:", max_concurrency)) + if request_rate != float("inf"): + print("{:<40} {:<10.2f}".format("Request rate configured (RPS):", request_rate)) print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration)) print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input)) print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output)) @@ -517,6 +508,20 @@ async def benchmark( print("=" * 50) + if profile: + print("Stopping profiler...") + profile_input = RequestFuncInput( + model=model_id, + prompt=test_prompt, + api_url=base_url + "/stop_profile", + prompt_len=test_prompt_len, + output_len=test_output_len, + logprobs=logprobs, + ) + profile_output = await request_func(request_func_input=profile_input) + if profile_output.success: + print("Profiler stopped") + return result @@ -593,6 +598,10 @@ def save_to_pytorch_benchmark_format( write_to_json(pt_file, pt_records) +@deprecated( + "benchmark_serving.py is deprecated and will be removed in a future " + "version. 
Please use 'vllm bench serve' instead.", +) def main(args: argparse.Namespace): print(args) random.seed(args.seed) diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py index e23a5a9e2233d..2a22f122c78e6 100644 --- a/benchmarks/benchmark_serving_structured_output.py +++ b/benchmarks/benchmark_serving_structured_output.py @@ -538,20 +538,6 @@ async def benchmark( ) outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks) - if profile: - print("Stopping profiler...") - profile_input = RequestFuncInput( - model=model_id, - prompt=test_request.prompt, - api_url=base_url + "/stop_profile", - prompt_len=test_request.prompt_len, - output_len=test_request.expected_output_len, - extra_body={test_request.structure_type: test_request.schema}, - ) - profile_output = await request_func(request_func_input=profile_input) - if profile_output.success: - print("Profiler stopped") - if pbar is not None: pbar.close() @@ -569,6 +555,10 @@ async def benchmark( print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="=")) print("{:<40} {:<10}".format("Successful requests:", metrics.completed)) + if max_concurrency is not None: + print("{:<40} {:<10}".format("Maximum request concurrency:", max_concurrency)) + if request_rate != float("inf"): + print("{:<40} {:<10.2f}".format("Request rate configured (RPS):", request_rate)) print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration)) print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input)) print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output)) @@ -666,6 +656,20 @@ async def benchmark( print("=" * 50) + if profile: + print("Stopping profiler...") + profile_input = RequestFuncInput( + model=model_id, + prompt=test_request.prompt, + api_url=base_url + "/stop_profile", + prompt_len=test_request.prompt_len, + output_len=test_request.expected_output_len, + extra_body={test_request.structure_type: test_request.schema}, + ) + profile_output = await request_func(request_func_input=profile_input) + if profile_output.success: + print("Profiler stopped") + return result, ret diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index c0a7f1d582505..c51b579686529 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -15,6 +15,7 @@ import torch import uvloop from tqdm import tqdm from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase +from typing_extensions import deprecated from benchmark_dataset import ( AIMODataset, @@ -382,6 +383,10 @@ def get_requests(args, tokenizer): return dataset_cls(**common_kwargs).sample(**sample_kwargs) +@deprecated( + "benchmark_throughput.py is deprecated and will be removed in a " + "future version. Please use 'vllm bench throughput' instead.", +) def main(args: argparse.Namespace): if args.seed is None: args.seed = 0 diff --git a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh index 94999630bae12..92f97ffabea2a 100644 --- a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh +++ b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh @@ -3,7 +3,7 @@ # benchmark the overhead of disaggregated prefill. # methodology: # - send all request to prefill vLLM instance. It will buffer KV cache. -# - then send all request to decode instance. +# - then send all request to decode instance. # - The TTFT of decode instance is the overhead. 
set -ex @@ -12,6 +12,8 @@ kill_gpu_processes() { # kill all processes on GPU. pgrep pt_main_thread | xargs -r kill -9 pgrep python3 | xargs -r kill -9 + # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445 + pgrep VLLM | xargs -r kill -9 sleep 10 # remove vllm config file @@ -61,7 +63,7 @@ benchmark() { --gpu-memory-utilization 0.6 \ --kv-transfer-config \ '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' & - + CUDA_VISIBLE_DEVICES=1 python3 \ -m vllm.entrypoints.openai.api_server \ @@ -76,38 +78,38 @@ benchmark() { wait_for_server 8200 # let the prefill instance finish prefill - python3 ../benchmark_serving.py \ - --backend vllm \ - --model $model \ - --dataset-name $dataset_name \ - --dataset-path $dataset_path \ - --sonnet-input-len $input_len \ - --sonnet-output-len "$output_len" \ - --sonnet-prefix-len $prefix_len \ - --num-prompts $num_prompts \ - --port 8100 \ - --save-result \ - --result-dir $results_folder \ - --result-filename disagg_prefill_tp1.json \ - --request-rate "inf" + vllm bench serve \ + --backend vllm \ + --model $model \ + --dataset-name $dataset_name \ + --dataset-path $dataset_path \ + --sonnet-input-len $input_len \ + --sonnet-output-len "$output_len" \ + --sonnet-prefix-len $prefix_len \ + --num-prompts $num_prompts \ + --port 8100 \ + --save-result \ + --result-dir $results_folder \ + --result-filename disagg_prefill_tp1.json \ + --request-rate "inf" # send the request to decode. # The TTFT of this command will be the overhead of disagg prefill impl. - python3 ../benchmark_serving.py \ - --backend vllm \ - --model $model \ - --dataset-name $dataset_name \ - --dataset-path $dataset_path \ - --sonnet-input-len $input_len \ - --sonnet-output-len "$output_len" \ - --sonnet-prefix-len $prefix_len \ - --num-prompts $num_prompts \ - --port 8200 \ - --save-result \ - --result-dir $results_folder \ - --result-filename disagg_prefill_tp1_overhead.json \ - --request-rate "$qps" + vllm bench serve \ + --backend vllm \ + --model $model \ + --dataset-name $dataset_name \ + --dataset-path $dataset_path \ + --sonnet-input-len $input_len \ + --sonnet-output-len "$output_len" \ + --sonnet-prefix-len $prefix_len \ + --num-prompts $num_prompts \ + --port 8200 \ + --save-result \ + --result-dir $results_folder \ + --result-filename disagg_prefill_tp1_overhead.json \ + --request-rate "$qps" kill_gpu_processes } diff --git a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh index eb5d891d0d4a5..af2bcba3ea57a 100644 --- a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh +++ b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh @@ -18,6 +18,8 @@ kill_gpu_processes() { # kill all processes on GPU. 
pgrep pt_main_thread | xargs -r kill -9 pgrep python3 | xargs -r kill -9 + # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445 + pgrep VLLM | xargs -r kill -9 for port in 8000 8100 8200; do lsof -t -i:$port | xargs -r kill -9; done sleep 1 } @@ -58,7 +60,7 @@ launch_chunked_prefill() { launch_disagg_prefill() { - model="meta-llama/Meta-Llama-3.1-8B-Instruct" + model="meta-llama/Meta-Llama-3.1-8B-Instruct" # disagg prefill CUDA_VISIBLE_DEVICES=0 python3 \ -m vllm.entrypoints.openai.api_server \ @@ -97,20 +99,20 @@ benchmark() { output_len=$2 tag=$3 - python3 ../benchmark_serving.py \ - --backend vllm \ - --model $model \ - --dataset-name $dataset_name \ - --dataset-path $dataset_path \ - --sonnet-input-len $input_len \ - --sonnet-output-len "$output_len" \ - --sonnet-prefix-len $prefix_len \ - --num-prompts $num_prompts \ - --port 8000 \ - --save-result \ - --result-dir $results_folder \ - --result-filename "$tag"-qps-"$qps".json \ - --request-rate "$qps" + vllm bench serve \ + --backend vllm \ + --model $model \ + --dataset-name $dataset_name \ + --dataset-path $dataset_path \ + --sonnet-input-len $input_len \ + --sonnet-output-len "$output_len" \ + --sonnet-prefix-len $prefix_len \ + --num-prompts $num_prompts \ + --port 8000 \ + --save-result \ + --result-dir $results_folder \ + --result-filename "$tag"-qps-"$qps".json \ + --request-rate "$qps" sleep 2 } diff --git a/benchmarks/kernels/benchmark_moe_align_block_size.py b/benchmarks/kernels/benchmark_moe_align_block_size.py index 1af5a21caf465..f540cff6261a8 100644 --- a/benchmarks/kernels/benchmark_moe_align_block_size.py +++ b/benchmarks/kernels/benchmark_moe_align_block_size.py @@ -5,9 +5,8 @@ import itertools import torch -from vllm import _custom_ops as ops from vllm.model_executor.layers.fused_moe.moe_align_block_size import ( - moe_align_block_size_triton, + moe_align_block_size, ) from vllm.triton_utils import triton @@ -21,60 +20,6 @@ def get_topk_ids(num_tokens: int, num_experts: int, topk: int) -> torch.Tensor: ) -def check_correctness(num_tokens, num_experts=256, block_size=256, topk=8): - """ - Verifies vllm vs. Triton - """ - topk_ids = get_topk_ids(num_tokens, num_experts, topk) - - # 1. malloc space for triton and vllm - # malloc enough space (max_num_tokens_padded) for the sorted ids - max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1) - sorted_ids_triton = torch.empty( - (max_num_tokens_padded,), dtype=torch.int32, device="cuda" - ) - expert_ids_triton = torch.empty( - (max_num_tokens_padded // block_size,), dtype=torch.int32, device="cuda" - ) - num_tokens_post_pad_triton = torch.empty((1,), dtype=torch.int32, device="cuda") - - sorted_ids_vllm = torch.empty_like(sorted_ids_triton) - expert_ids_vllm = torch.empty_like(expert_ids_triton) - num_tokens_post_pad_vllm = torch.empty_like(num_tokens_post_pad_triton) - - # 2. run implementations - moe_align_block_size_triton( - topk_ids, - num_experts, - block_size, - sorted_ids_triton, - expert_ids_triton, - num_tokens_post_pad_triton, - ) - - ops.moe_align_block_size( - topk_ids, - num_experts, - block_size, - sorted_ids_vllm, - expert_ids_vllm, - num_tokens_post_pad_vllm, - ) - print(f"✅ VLLM implementation works with {num_experts} experts!") - - # 3. 
compare results - if torch.allclose(expert_ids_triton, expert_ids_vllm) and torch.allclose( - num_tokens_post_pad_triton, num_tokens_post_pad_vllm - ): - print("✅ Triton and VLLM implementations match.") - else: - print("❌ Triton and VLLM implementations DO NOT match.") - print("Triton expert_ids:", expert_ids_triton) - print("VLLM expert_ids:", expert_ids_vllm) - print("Triton num_tokens_post_pad:", num_tokens_post_pad_triton) - print("VLLM num_tokens_post_pad:", num_tokens_post_pad_vllm) - - # test configurations num_tokens_range = [1, 16, 256, 4096] num_experts_range = [16, 64, 224, 256, 280, 512] @@ -87,8 +32,8 @@ configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range x_names=["num_tokens", "num_experts", "topk"], x_vals=configs, line_arg="provider", - line_vals=["vllm", "triton"], # "triton" - line_names=["VLLM", "Triton"], # "Triton" + line_vals=["vllm"], + line_names=["vLLM"], plot_name="moe-align-block-size-performance", args={}, ) @@ -98,36 +43,11 @@ def benchmark(num_tokens, num_experts, topk, provider): block_size = 256 topk_ids = get_topk_ids(num_tokens, num_experts, topk) - max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1) - sorted_ids = torch.empty((max_num_tokens_padded,), dtype=torch.int32, device="cuda") - max_num_m_blocks = max_num_tokens_padded // block_size - expert_ids = torch.empty((max_num_m_blocks,), dtype=torch.int32, device="cuda") - num_tokens_post_pad = torch.empty((1,), dtype=torch.int32, device="cuda") - quantiles = [0.5, 0.2, 0.8] if provider == "vllm": ms, min_ms, max_ms = triton.testing.do_bench( - lambda: ops.moe_align_block_size( - topk_ids, - num_experts, - block_size, - sorted_ids.clone(), - expert_ids.clone(), - num_tokens_post_pad.clone(), - ), - quantiles=quantiles, - ) - elif provider == "triton": - ms, min_ms, max_ms = triton.testing.do_bench( - lambda: moe_align_block_size_triton( - topk_ids, - num_experts, - block_size, - sorted_ids.clone(), - expert_ids.clone(), - num_tokens_post_pad.clone(), - ), + lambda: moe_align_block_size(topk_ids, block_size, num_experts), quantiles=quantiles, ) @@ -151,6 +71,4 @@ if __name__ == "__main__": ) args = parser.parse_args() - print("Running correctness check...") - check_correctness(num_tokens=1024, num_experts=args.num_experts, topk=args.topk) benchmark.run(print_data=True, show_plots=True) diff --git a/benchmarks/kernels/benchmark_moe_permute_unpermute.py b/benchmarks/kernels/benchmark_moe_permute_unpermute.py index 4ed6900901442..04d2205aa3722 100644 --- a/benchmarks/kernels/benchmark_moe_permute_unpermute.py +++ b/benchmarks/kernels/benchmark_moe_permute_unpermute.py @@ -8,12 +8,13 @@ import ray import torch from transformers import AutoConfig -from vllm.model_executor.layers.fused_moe.deep_gemm_moe import ( +from vllm.model_executor.layers.fused_moe.fused_moe import * +from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import ( _moe_permute, _moe_unpermute_and_reduce, + moe_permute, + moe_unpermute, ) -from vllm.model_executor.layers.fused_moe.fused_moe import * -from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import * from vllm.model_executor.layers.fused_moe.utils import _fp8_quantize from vllm.platforms import current_platform from vllm.utils import FlexibleArgumentParser @@ -63,18 +64,19 @@ def benchmark_permute( def run(): if use_customized_permute: - (permuted_hidden_states, first_token_off, inv_perm_idx, m_indices) = ( - moe_permute( - qhidden_states, - topk_weights=topk_weights, - topk_ids=topk_ids, - 
token_expert_indices=token_expert_indices, - topk=topk, - n_expert=num_experts, - n_local_expert=num_experts, - expert_map=None, - align_block_size=align_block_size, - ) + ( + permuted_hidden_states, + a1q_scale, + first_token_off, + inv_perm_idx, + m_indices, + ) = moe_permute( + qhidden_states, + a1q_scale=None, + topk_ids=topk_ids, + n_expert=num_experts, + expert_map=None, + align_block_size=align_block_size, ) else: ( @@ -150,18 +152,19 @@ def benchmark_unpermute( def prepare(): if use_customized_permute: - (permuted_hidden_states, first_token_off, inv_perm_idx, m_indices) = ( - moe_permute( - qhidden_states, - topk_weights=topk_weights, - topk_ids=topk_ids, - token_expert_indices=token_expert_indices, - topk=topk, - n_expert=num_experts, - n_local_expert=num_experts, - expert_map=None, - align_block_size=align_block_size, - ) + ( + permuted_hidden_states, + a1q_scale, + first_token_off, + inv_perm_idx, + m_indices, + ) = moe_permute( + qhidden_states, + a1q_scale=None, + topk_ids=topk_ids, + n_expert=num_experts, + expert_map=None, + align_block_size=align_block_size, ) # convert to fp16/bf16 as gemm output return ( @@ -191,16 +194,19 @@ def benchmark_unpermute( def run(input: tuple): if use_customized_permute: - (permuted_hidden_states, first_token_off, inv_perm_idx, m_indices) = input + ( + permuted_hidden_states, + first_token_off, + inv_perm_idx, + m_indices, + ) = input + output = torch.empty_like(hidden_states) moe_unpermute( + output, permuted_hidden_states, topk_weights, - topk_ids, inv_perm_idx, first_token_off, - topk, - num_experts, - num_experts, ) else: ( @@ -211,7 +217,11 @@ def benchmark_unpermute( inv_perm, ) = input _moe_unpermute_and_reduce( - output_hidden_states, permuted_hidden_states, inv_perm, topk_weights + output_hidden_states, + permuted_hidden_states, + inv_perm, + topk_weights, + True, ) # JIT compilation & warmup diff --git a/benchmarks/kernels/benchmark_per_token_group_quant.py b/benchmarks/kernels/benchmark_per_token_group_quant.py new file mode 100644 index 0000000000000..1ccb5e08b3d57 --- /dev/null +++ b/benchmarks/kernels/benchmark_per_token_group_quant.py @@ -0,0 +1,159 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import argparse +import math +from contextlib import contextmanager +from typing import Callable +from unittest.mock import patch + +import torch + +from vllm.model_executor.layers.quantization.utils import fp8_utils, int8_utils +from vllm.platforms import current_platform + + +@contextmanager +def _triton_mode(): + """Temporarily force the Triton fallback path""" + with patch("vllm.platforms.current_platform.is_cuda", return_value=False): + yield + + +def _time_cuda( + fn: Callable[[], tuple[torch.Tensor, torch.Tensor]], + warmup_iters: int, + bench_iters: int, +) -> float: + # warmup + for _ in range(warmup_iters): + fn() + torch.cuda.synchronize() + + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + + start.record() + for _ in range(bench_iters): + fn() + end.record() + torch.cuda.synchronize() + + return start.elapsed_time(end) / bench_iters # ms/iter + + +def _run_single( + shape: tuple[int, int], + group_size: int, + dtype: str, + *, + column_major: bool = False, + scale_ue8m0: bool = False, + warmup_iters: int, + bench_iters: int, +) -> None: + num_tokens, hidden_dim = shape + + device = torch.device("cuda") + torch.manual_seed(42) + x = torch.randn(num_tokens, hidden_dim, device=device, dtype=torch.bfloat16) * 8 + + if 
dtype == "fp8": + + def cuda_impl(): + return fp8_utils.per_token_group_quant_fp8( + x, + group_size, + column_major_scales=column_major, + use_ue8m0=scale_ue8m0, + ) + + def triton_impl(): + with _triton_mode(): + return fp8_utils.per_token_group_quant_fp8( + x, + group_size, + column_major_scales=column_major, + use_ue8m0=scale_ue8m0, + ) + elif dtype == "int8": + + def cuda_impl(): + return int8_utils.per_token_group_quant_int8(x, group_size) + + def triton_impl(): + with _triton_mode(): + return int8_utils.per_token_group_quant_int8(x, group_size) + else: + raise ValueError("dtype must be 'fp8' or 'int8'") + + cuda_ms = _time_cuda(cuda_impl, warmup_iters, bench_iters) + triton_ms = _time_cuda(triton_impl, warmup_iters, bench_iters) + + speedup = triton_ms / cuda_ms if cuda_ms else math.inf + + cfg_desc = ( + f"shape={shape} gs={group_size:<3} col_major={column_major:<5} " + f"ue8m0={scale_ue8m0:<5} dtype={dtype}" + ) + print( + f"{cfg_desc:55} | CUDA {cuda_ms:7.3f} ms | Triton {triton_ms:7.3f} ms | " + f"speed-up ×{speedup:5.2f}" + ) + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--warmup-iters", type=int, default=10) + parser.add_argument("--bench-iters", type=int, default=100) + parser.add_argument("--dtype", choices=["fp8", "int8", "both"], default="both") + return parser.parse_args() + + +if __name__ == "__main__": + if not current_platform.is_cuda(): + raise RuntimeError("CUDA device is required to run this benchmark.") + + args = parse_args() + warmup_iters, bench_iters = args.warmup_iters, args.bench_iters + + shapes = [(32, 128), (64, 256), (16, 512)] + group_sizes = [64, 128] + + dtypes = ["fp8", "int8"] if args.dtype == "both" else [args.dtype] + + header = ( + "Configuration".ljust(55) + + " | " + + "CUDA (ms)".center(12) + + " | " + + "Triton (ms)".center(13) + + " | " + + "Speed-up" + ) + print(header) + print("-" * len(header)) + + for dtype in dtypes: + for shape in shapes: + for gs in group_sizes: + if dtype == "fp8": + for col_major in (False, True): + for ue8m0 in (False, True): + _run_single( + shape, + gs, + dtype, + column_major=col_major, + scale_ue8m0=ue8m0, + warmup_iters=warmup_iters, + bench_iters=bench_iters, + ) + else: # INT8 has no col-major / ue8m0 switches + _run_single( + shape, + gs, + dtype, + warmup_iters=warmup_iters, + bench_iters=bench_iters, + ) diff --git a/benchmarks/kernels/benchmark_trtllm_attention.py b/benchmarks/kernels/benchmark_trtllm_attention.py index 8c980f930366c..68c48858e61cc 100644 --- a/benchmarks/kernels/benchmark_trtllm_attention.py +++ b/benchmarks/kernels/benchmark_trtllm_attention.py @@ -71,22 +71,20 @@ def benchmark_decode( if kv_cache_dtype.startswith("fp8"): kv_cache, _ = to_float8(kv_cache) + output_trtllm = torch.empty(q.shape, dtype=dtype) + # Benchmark TRT decode def trt_decode(): return flashinfer.decode.trtllm_batch_decode_with_kv_cache( q, kv_cache, workspace_buffer, - num_qo_heads, - num_kv_heads, - sm_scale, block_tables, kv_lens_tensor, - page_size, max_kv_len, - kv_cache_dtype, - k_scale, - v_scale, + bmm1_scale=k_scale * sm_scale, + bmm2_scale=v_scale, + out=output_trtllm, ) def time_fn(fn, warmup=10, trials=20): @@ -125,6 +123,8 @@ def benchmark_decode( kv_indices = torch.tensor(kv_indices, dtype=torch.int32) kv_last_page_lens = torch.tensor(kv_last_page_lens, dtype=torch.int32) + output_baseline = torch.empty(q.shape, dtype=dtype) + wrapper = flashinfer.BatchDecodeWithPagedKVCacheWrapper( workspace_buffer, kv_layout, @@ -145,7 +145,7 @@ def benchmark_decode( ) def 
baseline_decode(): - return wrapper.run(q, kv_cache, sm_scale, k_scale, v_scale) + return wrapper.run(q, kv_cache, sm_scale, k_scale, v_scale, output_baseline) baseline_mean, baseline_std = time_fn(baseline_decode) @@ -214,25 +214,39 @@ if __name__ == "__main__": max_seq_lens = [1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072] all_results = [] - print("Running benchmark for kv_cache_dtype: bfloat16") print( - "\tnum_seqs\tmax_seq_len\ttrt_mean\ttrt_std\tbaseline_mean\tbaseline_std\tspeedup_percent" + "Running benchmark for q_dtype = bfloat16, kv_cache_dtype: bfloat16, " + "output_dtype: bfloat16" + ) + print( + "\tnum_seqs\tmax_seq_len\ttrt_mean\ttrt_std\tbaseline_mean\t" + "baseline_std\tspeedup_percent" ) for max_seq_len in max_seq_lens: for bs in num_seqs: result = benchmark_decode( - bs, max_seq_len, dtype=torch.bfloat16, kv_cache_dtype="auto" + bs, + max_seq_len, + dtype=torch.bfloat16, + kv_cache_dtype="auto", ) all_results.append(result) - print("Running benchmark for q_dtype = bfloat16, kv_cache_dtype: fp8") print( - "\tnum_seqs\tmax_seq_len\ttrt_mean\ttrt_std\tbaseline_mean\tbaseline_std\tspeedup_percent" + "Running benchmark for q_dtype = bfloat16, kv_cache_dtype: fp8, " + "output_dtype: bfloat16" + ) + print( + "\tnum_seqs\tmax_seq_len\ttrt_mean\ttrt_std\tbaseline_mean\t" + "baseline_std\tspeedup_percent" ) for max_seq_len in max_seq_lens: for bs in num_seqs: result = benchmark_decode( - bs, max_seq_len, dtype=torch.bfloat16, kv_cache_dtype="fp8" + bs, + max_seq_len, + dtype=torch.bfloat16, + kv_cache_dtype="fp8", ) all_results.append(result) diff --git a/benchmarks/kernels/deepgemm/README.md b/benchmarks/kernels/deepgemm/README.md index 917e814010f89..41e68e047be82 100644 --- a/benchmarks/kernels/deepgemm/README.md +++ b/benchmarks/kernels/deepgemm/README.md @@ -8,7 +8,7 @@ Currently this just includes dense GEMMs and only works on Hopper GPUs. You need to install vLLM in your usual fashion, then install DeepGEMM from source in its own directory: -``` +```bash git clone --recursive https://github.com/deepseek-ai/DeepGEMM cd DeepGEMM python setup.py install @@ -17,7 +17,7 @@ uv pip install -e . ## Usage -``` +```console python benchmark_fp8_block_dense_gemm.py INFO 02-26 21:55:13 [__init__.py:207] Automatically detected platform cuda. ===== STARTING FP8 GEMM BENCHMARK ===== diff --git a/csrc/cpu/quant.cpp b/csrc/cpu/quant.cpp index c1f7c64ea2f49..6e120b8d20a7e 100644 --- a/csrc/cpu/quant.cpp +++ b/csrc/cpu/quant.cpp @@ -16,12 +16,14 @@ struct KernelVecType { using cvt_vec_type = vec_op::FP32Vec16; }; +#if !defined(__aarch64__) || defined(ARM_BF16_SUPPORT) template <> struct KernelVecType { using load_vec_type = vec_op::BF16Vec16; using azp_adj_load_vec_type = vec_op::INT32Vec16; using cvt_vec_type = vec_op::FP32Vec16; }; +#endif template <> struct KernelVecType { diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp index f1738aee980b6..b20a054648428 100644 --- a/csrc/cpu/torch_bindings.cpp +++ b/csrc/cpu/torch_bindings.cpp @@ -151,7 +151,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.impl("rotary_embedding", torch::kCPU, &rotary_embedding); // Quantization -#if defined(__AVX512F__) || defined(__aarch64__) +#if defined(__AVX512F__) || (defined(__aarch64__) && !defined(__APPLE__)) at::Tag stride_tag = at::Tag::needs_fixed_stride_order; // Compute int8 quantized tensor for given scaling factor. 
diff --git a/csrc/moe/moe_permute_unpermute_op.cu b/csrc/moe/moe_permute_unpermute_op.cu index a77471a7f2078..2922352a3f7cc 100644 --- a/csrc/moe/moe_permute_unpermute_op.cu +++ b/csrc/moe/moe_permute_unpermute_op.cu @@ -10,32 +10,28 @@ void moe_permute( const torch::Tensor& input, // [n_token, hidden] - const torch::Tensor& topk_weights, //[n_token, topk] - torch::Tensor& topk_ids, // [n_token, topk] + const torch::Tensor& topk_ids, // [n_token, topk] const torch::Tensor& token_expert_indices, // [n_token, topk] const std::optional& expert_map, // [n_expert] int64_t n_expert, int64_t n_local_expert, int64_t topk, const std::optional& align_block_size, - torch::Tensor& - permuted_input, // [topk * n_token/align_block_size_m, hidden] + torch::Tensor& permuted_input, // [permuted_size, hidden] torch::Tensor& expert_first_token_offset, // [n_local_expert + 1] - torch::Tensor& src_row_id2dst_row_id_map, // [n_token, topk] + torch::Tensor& inv_permuted_idx, // [n_token, topk] + torch::Tensor& permuted_idx, // [permute_size] torch::Tensor& m_indices) { // [align_expand_m] - TORCH_CHECK(topk_weights.scalar_type() == at::ScalarType::Float, - "topk_weights must be float32"); TORCH_CHECK(expert_first_token_offset.scalar_type() == at::ScalarType::Long, "expert_first_token_offset must be int64"); TORCH_CHECK(topk_ids.scalar_type() == at::ScalarType::Int, "topk_ids must be int32"); TORCH_CHECK(token_expert_indices.scalar_type() == at::ScalarType::Int, "token_expert_indices must be int32"); - TORCH_CHECK(src_row_id2dst_row_id_map.scalar_type() == at::ScalarType::Int, - "src_row_id2dst_row_id_map must be int32"); + TORCH_CHECK(inv_permuted_idx.scalar_type() == at::ScalarType::Int, + "inv_permuted_idx must be int32"); TORCH_CHECK(expert_first_token_offset.size(0) == n_local_expert + 1, "expert_first_token_offset shape != n_local_expert+1") - TORCH_CHECK( - src_row_id2dst_row_id_map.sizes() == token_expert_indices.sizes(), - "token_expert_indices shape must be same as src_row_id2dst_row_id_map"); + TORCH_CHECK(inv_permuted_idx.sizes() == token_expert_indices.sizes(), + "token_expert_indices shape must be same as inv_permuted_idx"); auto n_token = input.sizes()[0]; auto n_hidden = input.sizes()[1]; auto align_block_size_value = @@ -46,8 +42,9 @@ void moe_permute( auto sort_workspace = torch::empty( {sorter_size}, torch::dtype(torch::kInt8).device(torch::kCUDA).requires_grad(false)); + auto copy_topk_ids = topk_ids.clone(); // copy topk_ids for preprocess auto permuted_experts_id = torch::empty_like(topk_ids); - auto dst_row_id2src_row_id_map = torch::empty_like(src_row_id2dst_row_id_map); + auto sorted_row_idx = torch::empty_like(inv_permuted_idx); auto align_expert_first_token_offset = torch::zeros_like(expert_first_token_offset); @@ -67,24 +64,22 @@ void moe_permute( const int* expert_map_ptr = get_ptr(expert_map.value()); valid_num_ptr = get_ptr(expert_first_token_offset) + n_local_expert; - preprocessTopkIdLauncher(get_ptr(topk_ids), n_token * topk, + preprocessTopkIdLauncher(get_ptr(copy_topk_ids), n_token * topk, expert_map_ptr, n_expert, stream); } // expert sort topk expert id and scan expert id get expert_first_token_offset - sortAndScanExpert(get_ptr(topk_ids), get_ptr(token_expert_indices), - get_ptr(permuted_experts_id), - get_ptr(dst_row_id2src_row_id_map), - get_ptr(expert_first_token_offset), n_token, - n_expert, n_local_expert, topk, sorter, - get_ptr(sort_workspace), stream); + sortAndScanExpert( + get_ptr(copy_topk_ids), get_ptr(token_expert_indices), + get_ptr(permuted_experts_id), 
get_ptr(sorted_row_idx), + get_ptr(expert_first_token_offset), n_token, n_expert, + n_local_expert, topk, sorter, get_ptr(sort_workspace), stream); // dispatch expandInputRowsKernelLauncher MOE_DISPATCH(input.scalar_type(), [&] { expandInputRowsKernelLauncher( get_ptr(input), get_ptr(permuted_input), - get_ptr(topk_weights), get_ptr(permuted_experts_id), - get_ptr(dst_row_id2src_row_id_map), - get_ptr(src_row_id2dst_row_id_map), + get_ptr(permuted_experts_id), get_ptr(sorted_row_idx), + get_ptr(inv_permuted_idx), get_ptr(permuted_idx), get_ptr(expert_first_token_offset), n_token, valid_num_ptr, n_hidden, topk, n_local_expert, align_block_size_value, stream); }); @@ -101,32 +96,34 @@ void moe_permute( } void moe_unpermute( - const torch::Tensor& permuted_hidden_states, // [n_token * topk, hidden] - const torch::Tensor& topk_weights, //[n_token, topk] - const torch::Tensor& topk_ids, // [n_token, topk] - const torch::Tensor& src_row_id2dst_row_id_map, // [n_token, topk] - const torch::Tensor& expert_first_token_offset, // [n_local_expert+1] - int64_t n_expert, int64_t n_local_expert, int64_t topk, + const torch::Tensor& permuted_hidden_states, // [n_token * topk, hidden] + const torch::Tensor& topk_weights, // [n_token, topk] + const torch::Tensor& inv_permuted_idx, // [n_token, topk] + const std::optional& + expert_first_token_offset, // [n_local_expert+1] + int64_t topk, torch::Tensor& hidden_states // [n_token, hidden] ) { - TORCH_CHECK(src_row_id2dst_row_id_map.sizes() == topk_ids.sizes(), - "topk_ids shape must be same as src_row_id2dst_row_id_map"); - TORCH_CHECK(topk_ids.scalar_type() == at::ScalarType::Int, - "topk_ids must be int32"); TORCH_CHECK( permuted_hidden_states.scalar_type() == hidden_states.scalar_type(), - "topk_ids dtype must be same as src_row_id2dst_row_id_map"); + "permuted_hidden_states dtype must be same as hidden_states"); auto n_token = hidden_states.size(0); auto n_hidden = hidden_states.size(1); auto stream = at::cuda::getCurrentCUDAStream().stream(); - const int64_t* valid_ptr = - get_ptr(expert_first_token_offset) + n_local_expert; + + int64_t const* valid_ptr = nullptr; + if (expert_first_token_offset.has_value()) { + int n_local_expert = expert_first_token_offset.value().size(0) - 1; + valid_ptr = + get_ptr(expert_first_token_offset.value()) + n_local_expert; + } + MOE_DISPATCH(hidden_states.scalar_type(), [&] { finalizeMoeRoutingKernelLauncher( get_ptr(permuted_hidden_states), get_ptr(hidden_states), get_ptr(topk_weights), - get_ptr(src_row_id2dst_row_id_map), get_ptr(topk_ids), - n_token, n_hidden, topk, valid_ptr, stream); + get_ptr(inv_permuted_idx), n_token, n_hidden, topk, valid_ptr, + stream); }); } diff --git a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu index de2c153882d93..2271c1bc75b1f 100644 --- a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu +++ b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu @@ -177,7 +177,7 @@ __global__ void getMIndicesKernel(int64_t* expert_first_token_offset, int tidx = threadIdx.x; extern __shared__ int64_t smem_expert_first_token_offset[]; for (int i = tidx; i <= num_local_expert; i += blockDim.x) { - smem_expert_first_token_offset[tidx] = __ldg(expert_first_token_offset + i); + smem_expert_first_token_offset[i] = __ldg(expert_first_token_offset + i); } __syncthreads(); auto last_token_offset = smem_expert_first_token_offset[eidx + 1]; diff --git 
a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.h b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.h index 43c29721cd16e..108091efbefa8 100644 --- a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.h +++ b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.h @@ -57,31 +57,19 @@ void sortAndScanExpert(int* expert_for_source_row, const int* source_rows, template void expandInputRowsKernelLauncher( - T const* unpermuted_input, T* permuted_output, - const float* unpermuted_scales, int* sorted_experts, + T const* unpermuted_input, T* permuted_output, int* sorted_experts, int const* expanded_dest_row_to_expanded_source_row, - int* expanded_source_row_to_expanded_dest_row, + int* expanded_source_row_to_expanded_dest_row, int* permuted_idx, int64_t* expert_first_token_offset, int64_t const num_rows, int64_t const* num_valid_tokens_ptr, int64_t const cols, int const k, int num_local_experts, const int& align_block_size, cudaStream_t stream); -// Final kernel to unpermute and scale -// This kernel unpermutes the original data, does the k-way reduction and -// performs the final skip connection. -template -__global__ void finalizeMoeRoutingKernel( - T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output, - float const* scales, int const* expanded_source_row_to_expanded_dest_row, - int const* expert_for_source_row, int64_t const orig_cols, int64_t const k, - int64_t const* num_valid_ptr); - template void finalizeMoeRoutingKernelLauncher( T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output, float const* scales, int const* expanded_source_row_to_expanded_dest_row, - int const* expert_for_source_row, int64_t const num_rows, - int64_t const cols, int64_t const k, int64_t const* num_valid_ptr, - cudaStream_t stream); + int64_t const num_rows, int64_t const cols, int64_t const k, + int64_t const* num_valid_ptr, cudaStream_t stream); void preprocessTopkIdLauncher(int* topk_id_ptr, int size, const int* expert_map_ptr, int num_experts, diff --git a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.inl b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.inl index ad0d390665a00..449243b92a283 100644 --- a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.inl +++ b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.inl @@ -2,10 +2,9 @@ template __global__ void expandInputRowsKernel( - T const* unpermuted_input, T* permuted_output, - const float* unpermuted_scales, int* sorted_experts, + T const* unpermuted_input, T* permuted_output, int* sorted_experts, int const* expanded_dest_row_to_expanded_source_row, - int* expanded_source_row_to_expanded_dest_row, + int* expanded_source_row_to_expanded_dest_row, int* permuted_idx, int64_t* expert_first_token_offset, int64_t const num_rows, int64_t const* num_dest_rows, int64_t const cols, int64_t k, int num_local_experts, int align_block_size) { @@ -54,6 +53,10 @@ __global__ void expandInputRowsKernel( assert(expanded_dest_row <= INT32_MAX); expanded_source_row_to_expanded_dest_row[expanded_source_row] = static_cast(expanded_dest_row); + // skip non local expert token + if (!CHECK_SKIPPED || blockIdx.x < *num_dest_rows) { + permuted_idx[expanded_dest_row] = expanded_source_row; + } } if (!CHECK_SKIPPED || blockIdx.x < *num_dest_rows) { @@ -62,7 +65,7 @@ __global__ void expandInputRowsKernel( using DataElem = cutlass::Array; // Duplicate and permute rows - int64_t const source_row = expanded_source_row % num_rows; + int64_t 
const source_row = expanded_source_row / k; auto const* source_row_ptr = reinterpret_cast(unpermuted_input + source_row * cols); @@ -82,10 +85,9 @@ __global__ void expandInputRowsKernel( template void expandInputRowsKernelLauncher( - T const* unpermuted_input, T* permuted_output, - const float* unpermuted_scales, int* sorted_experts, + T const* unpermuted_input, T* permuted_output, int* sorted_experts, int const* expanded_dest_row_to_expanded_source_row, - int* expanded_source_row_to_expanded_dest_row, + int* expanded_source_row_to_expanded_dest_row, int* permuted_idx, int64_t* expert_first_token_offset, int64_t const num_rows, int64_t const* num_valid_tokens_ptr, int64_t const cols, int const k, int num_local_experts, const int& align_block_size, cudaStream_t stream) { @@ -105,11 +107,11 @@ void expandInputRowsKernelLauncher( int64_t smem_size = sizeof(int64_t) * (num_local_experts + 1); func<<>>( - unpermuted_input, permuted_output, unpermuted_scales, sorted_experts, + unpermuted_input, permuted_output, sorted_experts, expanded_dest_row_to_expanded_source_row, - expanded_source_row_to_expanded_dest_row, expert_first_token_offset, - num_rows, num_valid_tokens_ptr, cols, k, num_local_experts, - align_block_size); + expanded_source_row_to_expanded_dest_row, permuted_idx, + expert_first_token_offset, num_rows, num_valid_tokens_ptr, cols, k, + num_local_experts, align_block_size); } template @@ -128,11 +130,9 @@ template __global__ void finalizeMoeRoutingKernel( T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output, float const* scales, int const* expanded_source_row_to_expanded_dest_row, - int const* expert_for_source_row, int64_t const orig_cols, int64_t const k, - int64_t const* num_valid_ptr) { + int64_t const orig_cols, int64_t const k, int64_t const* num_valid_ptr) { assert(orig_cols % 4 == 0); int64_t const original_row = blockIdx.x; - int64_t const num_rows = gridDim.x; auto const offset = original_row * orig_cols; OutputType* reduced_row_ptr = reduced_unpermuted_output + offset; int64_t const num_valid = *num_valid_ptr; @@ -159,14 +159,13 @@ __global__ void finalizeMoeRoutingKernel( ComputeElem thread_output; thread_output.fill(0); for (int k_idx = 0; k_idx < k; ++k_idx) { - int64_t const expanded_original_row = original_row + k_idx * num_rows; + int64_t const expanded_original_row = original_row * k + k_idx; int64_t const expanded_permuted_row = expanded_source_row_to_expanded_dest_row[expanded_original_row]; int64_t const k_offset = original_row * k + k_idx; float const row_scale = scales[k_offset]; - // Check after row_rescale has accumulated if (CHECK_SKIPPED && expanded_permuted_row >= num_valid) { continue; } @@ -189,9 +188,8 @@ template void finalizeMoeRoutingKernelLauncher( T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output, float const* scales, int const* expanded_source_row_to_expanded_dest_row, - int const* expert_for_source_row, int64_t const num_rows, - int64_t const cols, int64_t const k, int64_t const* num_valid_ptr, - cudaStream_t stream) { + int64_t const num_rows, int64_t const cols, int64_t const k, + int64_t const* num_valid_ptr, cudaStream_t stream) { int64_t const blocks = num_rows; int64_t const threads = 256; bool const check_finished = num_valid_ptr != nullptr; @@ -201,6 +199,5 @@ void finalizeMoeRoutingKernelLauncher( auto* const kernel = func_map[check_finished]; kernel<<>>( expanded_permuted_rows, reduced_unpermuted_output, scales, - expanded_source_row_to_expanded_dest_row, expert_for_source_row, cols, k, - 
diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp
index 97df311d04409..d96e082f6ef11 100644
--- a/csrc/moe/torch_bindings.cpp
+++ b/csrc/moe/torch_bindings.cpp
@@ -56,18 +56,17 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
       " -> Tensor");
 
   m.def(
-      "moe_permute(Tensor input, Tensor topk_weight, Tensor! topk_ids,"
+      "moe_permute(Tensor input, Tensor topk_ids,"
       "Tensor token_expert_indices, Tensor? expert_map, int n_expert,"
       "int n_local_expert,"
       "int topk, int? align_block_size,Tensor! permuted_input, Tensor! "
-      "expert_first_token_offset, Tensor! src_row_id2dst_row_id_map, Tensor! "
-      "m_indices)->()");
+      "expert_first_token_offset, Tensor! inv_permuted_idx, Tensor! "
+      "permuted_idx, Tensor! m_indices)->()");
 
   m.def(
       "moe_unpermute(Tensor permuted_hidden_states, Tensor topk_weights,"
-      "Tensor topk_ids,Tensor src_row_id2dst_row_id_map, Tensor "
-      "expert_first_token_offset, int n_expert, int n_local_expert,int "
-      "topk, Tensor! hidden_states)->()");
+      "Tensor inv_permuted_idx, Tensor? expert_first_token_offset, "
+      "int topk, Tensor! hidden_states)->()");
 
   m.def("moe_permute_unpermute_supported() -> bool");
   m.impl("moe_permute_unpermute_supported", &moe_permute_unpermute_supported);
diff --git a/csrc/ops.h b/csrc/ops.h
index 97a247d9d628c..207291eceb169 100644
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -292,6 +292,11 @@ void per_token_group_quant_fp8(const torch::Tensor& input,
                                torch::Tensor& output_q, torch::Tensor& output_s,
                                int64_t group_size, double eps, double fp8_min,
                                double fp8_max, bool scale_ue8m0);
+
+void per_token_group_quant_int8(const torch::Tensor& input,
+                                torch::Tensor& output_q,
+                                torch::Tensor& output_s, int64_t group_size,
+                                double eps, double int8_min, double int8_max);
 #endif
 
 void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
diff --git a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu
index 5cd2ac179768b..d8369108d0bd3 100644
--- a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu
+++ b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu
@@ -1,6 +1,10 @@
 #include <ATen/cuda/CUDAContext.h>
 #include <torch/all.h>
 
+#ifndef USE_ROCM
+  #include "../per_token_group_quant_8bit.h"
+#endif
+
 #include <cmath>
 
 #include "../../dispatch_utils.h"
@@ -336,3 +340,13 @@ void dynamic_scaled_int8_quant(
     }
   });
 }
+
+#ifndef USE_ROCM
+void per_token_group_quant_int8(const torch::Tensor& input,
+                                torch::Tensor& output_q,
+                                torch::Tensor& output_s, int64_t group_size,
+                                double eps, double int8_min, double int8_max) {
+  per_token_group_quant_8bit(input, output_q, output_s, group_size, eps,
+                             int8_min, int8_max);
+}
+#endif
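The new `per_token_group_quant_int8` simply forwards to the shared 8-bit group-quant helper with int8 bounds. A hedged usage sketch follows; the output shapes, dtype of the scales, and the `eps` value are assumptions for illustration, not taken from this patch:

```cpp
#include <torch/all.h>

// Declared in csrc/ops.h (see the hunk above).
void per_token_group_quant_int8(const torch::Tensor& input,
                                torch::Tensor& output_q,
                                torch::Tensor& output_s, int64_t group_size,
                                double eps, double int8_min, double int8_max);

// Sketch: one scale per `group_size` contiguous elements of each token row.
torch::Tensor quantize_groups(const torch::Tensor& input) {  // [tokens, hidden]
  int64_t const group_size = 128;  // assumed group size
  auto output_q =
      torch::empty_like(input, input.options().dtype(torch::kInt8));
  auto output_s = torch::empty({input.size(0), input.size(1) / group_size},
                               input.options().dtype(torch::kFloat32));
  per_token_group_quant_int8(input, output_q, output_s, group_size,
                             /*eps=*/1e-10, /*int8_min=*/-128.0,
                             /*int8_max=*/127.0);
  return output_q;
}
```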
diff --git a/csrc/quantization/cutlass_w8a8/Epilogues.md b/csrc/quantization/cutlass_w8a8/Epilogues.md
index a30e1fdf3ac77..15a66913e97a3 100644
--- a/csrc/quantization/cutlass_w8a8/Epilogues.md
+++ b/csrc/quantization/cutlass_w8a8/Epilogues.md
@@ -86,6 +86,7 @@ D = s_a s_b \widehat A \widehat B
 ```
 
 Epilogue parameters:
+
 - `scale_a` is the scale for activations, can be per-tensor (scalar) or per-token (column-vector).
 - `scale_b` is the scale for weights, can be per-tensor (scalar) or per-channel (row-vector).
 
@@ -135,7 +136,7 @@ That is precomputed and stored in `azp_with_adj` as a row-vector.
 Epilogue parameters:
 
 - `scale_a` is the scale for activations, can be per-tensor (scalar) or per-token (column-vector).
-  - Generally this will be per-tensor as the zero-points are per-tensor.
+  - Generally this will be per-tensor as the zero-points are per-tensor.
 - `scale_b` is the scale for weights, can be per-tensor (scalar) or per-channel (row-vector).
 - `azp_with_adj` is the precomputed zero-point term ($` z_a J_a \widehat B `$), is per-channel (row-vector).
 - `bias` is the bias, is always per-channel (row-vector).
@@ -152,7 +153,7 @@ That means the zero-point term $` z_a J_a \widehat B `$ becomes an outer product
 Epilogue parameters:
 
 - `scale_a` is the scale for activations, can be per-tensor (scalar) or per-token (column-vector).
-  - Generally this will be per-token as the zero-points are per-token.
+  - Generally this will be per-token as the zero-points are per-token.
 - `scale_b` is the scale for weights, can be per-tensor (scalar) or per-channel (row-vector).
 - `azp_adj` is the precomputed zero-point adjustment term ($` \mathbf 1 \widehat B `$), is per-channel (row-vector).
 - `azp` is the zero-point (`z_a`), is per-token (column-vector).
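For reference, both azp variants in the hunks above fall out of the same algebra. Using the document's own definitions, asymmetric quantization gives $` A = s_a (\widehat A - z_a J_a) `$ with $` J_a `$ a matrix of ones, so:

```math
D = A B + \mathrm{bias}
  = s_a s_b \, \widehat A \widehat B \;-\; s_a s_b \, z_a \left( J_a \widehat B \right) + \mathrm{bias}
```

The correction term depends only on the weights: with a per-tensor `z_a` the whole row-vector $` z_a J_a \widehat B `$ can be folded offline into `azp_with_adj`, while with per-token zero-points only $` \mathbf 1 \widehat B `$ (`azp_adj`) is precomputed and the per-token `azp` multiplies it in the epilogue.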
diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu
index e092c61abc249..1db6c41bf9535 100644
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu
@@ -1,6 +1,5 @@
 #include "scaled_mm_kernels.hpp"
 #include "scaled_mm_sm90_fp8_dispatch.cuh"
-#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp"
 
 namespace vllm {
 
@@ -13,11 +12,11 @@ void cutlass_scaled_mm_sm90_fp8(torch::Tensor& out, torch::Tensor const& a,
   if (bias) {
     TORCH_CHECK(bias->dtype() == out.dtype(),
                 "currently bias dtype must match output dtype ", out.dtype());
-    return cutlass_scaled_mm_sm90_fp8_epilogue<c3x::ScaledEpilogueBias>(
-        out, a, b, a_scales, b_scales, *bias);
+    return cutlass_scaled_mm_sm90_fp8_epilogue<true>(out, a, b, a_scales,
+                                                     b_scales, *bias);
   } else {
-    return cutlass_scaled_mm_sm90_fp8_epilogue<c3x::ScaledEpilogue>(
-        out, a, b, a_scales, b_scales);
+    return cutlass_scaled_mm_sm90_fp8_epilogue<false>(out, a, b, a_scales,
+                                                      b_scales);
   }
 }
diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8_dispatch.cuh b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8_dispatch.cuh
index 32ea5db3321bc..4ff3e65f2b2e1 100644
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8_dispatch.cuh
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8_dispatch.cuh
@@ -2,6 +2,7 @@
 
 #include "scaled_mm.cuh"
 #include "cutlass_gemm_caller.cuh"
+#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp"
 
 /**
  * This file defines Gemm kernel configurations for SM90 (fp8) based on the Gemm
@@ -12,8 +13,91 @@ namespace vllm {
 
 using c3x::cutlass_gemm_caller;
 
-template <typename InType, typename OutType,
-          template <typename, typename, typename> typename Epilogue>
+template <typename ElementAB_, typename ElementD_,
+          template <typename, typename, typename> typename Epilogue_,
+          typename TileShape, typename ClusterShape, typename KernelSchedule,
+          typename EpilogueSchedule, bool swap_ab_ = false>
+struct cutlass_3x_gemm_sm90_fp8 {
+  using ElementAB = ElementAB_;
+  using ElementC = ElementD_;
+  using ElementD = ElementD_;
+  using ElementAcc =
+      typename std::conditional<std::is_same_v<ElementAB_, int8_t>, int32_t,
+                                float>::type;
+
+  using Epilogue = Epilogue_<ElementAcc, ElementD, TileShape>;
+
+  using EVTCompute = typename Epilogue::EVTCompute;
+
+  static constexpr int AlignmentAB =
+      128 / cutlass::sizeof_bits<ElementAB>::value;
+  static constexpr int AlignmentCD =
+      128 / cutlass::sizeof_bits<ElementD>::value;
+
+  // Compile-time swap_ab flag
+  static constexpr bool swap_ab = swap_ab_;
+
+  // -----------------------------------------------------------
+  // Layout definitions
+  // -----------------------------------------------------------
+  using LayoutA = cutlass::layout::RowMajor;
+  using LayoutA_T = typename cutlass::layout::LayoutTranspose<LayoutA>::type;
+
+  using LayoutB = cutlass::layout::ColumnMajor;
+  using LayoutB_T = typename cutlass::layout::LayoutTranspose<LayoutB>::type;
+
+  using LayoutD = cutlass::layout::RowMajor;
+  using LayoutD_Transpose =
+      typename cutlass::layout::LayoutTranspose<LayoutD>::type;
+
+  using LayoutC = LayoutD;
+  using LayoutC_Transpose = LayoutD_Transpose;
+
+  // -----------------------------------------------------------
+  // Collective epilogue (conditionally swap operands and layouts)
+  // -----------------------------------------------------------
+  using CollectiveEpilogue =
+      typename cutlass::epilogue::collective::CollectiveBuilder<
+          cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, TileShape,
+          ClusterShape, cutlass::epilogue::collective::EpilogueTileAuto,
+          ElementAcc, float, ElementC,
+          conditional_t<swap_ab, LayoutC_Transpose, LayoutC>, AlignmentCD,
+          ElementD, conditional_t<swap_ab, LayoutD_Transpose, LayoutD>,
+          AlignmentCD, EpilogueSchedule, EVTCompute>::CollectiveOp;
+
+  static constexpr size_t CEStorageSize =
+      sizeof(typename CollectiveEpilogue::SharedStorage);
+
+  using Stages = typename cutlass::gemm::collective::StageCountAutoCarveout<
+      static_cast<int>(CEStorageSize)>;
+
+  // -----------------------------------------------------------
+  // Collective mainloop (conditionally swap operands and layouts)
+  // -----------------------------------------------------------
+  using CollectiveMainloop = conditional_t<
+      swap_ab,
+      typename cutlass::gemm::collective::CollectiveBuilder<
+          cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, ElementAB,
+          LayoutB_T, AlignmentAB,             // Swapped B (as A)
+          ElementAB, LayoutA_T, AlignmentAB,  // Swapped A (as B)
+          ElementAcc, TileShape, ClusterShape, Stages,
+          KernelSchedule>::CollectiveOp,
+      typename cutlass::gemm::collective::CollectiveBuilder<
+          cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, ElementAB,
+          LayoutA, AlignmentAB, ElementAB, LayoutB, AlignmentAB, ElementAcc,
+          TileShape, ClusterShape, Stages, KernelSchedule>::CollectiveOp>;
+
+  // -----------------------------------------------------------
+  // Kernel definition
+  // -----------------------------------------------------------
+  using KernelType = enable_sm90_or_later<cutlass::gemm::kernel::GemmUniversal<
+      Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue,
+      cutlass::gemm::PersistentScheduler>>;
+
+  struct GemmKernel : public KernelType {};
+};
+
+template <typename InType, typename OutType, bool EnableBias>
 struct sm90_fp8_config_default {
   // M in (128, inf)
   static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
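The `swap_ab` machinery in the struct above implements a standard small-M trick; here is an editorial sketch of the idea, not code from this patch. For C = A·B with tiny m, compute Cᵀ = Bᵀ·Aᵀ instead, so the GEMM's hardware-friendly "M" dimension becomes the (large) n; transposed C/D layouts put the result back in row-major without an explicit transpose pass. The payoff is tile utilization:

```cpp
#include <cstdio>

// Fraction of each M-tile doing useful work for a tiny-m GEMM,
// with and without operand swapping (illustrative numbers).
int main() {
  int const m = 8, n = 4096, tile_m = 64;
  std::printf("no swap: %.2f%% of the M-tile is live\n",
              100.0 * m / tile_m);  // 12.50%
  int const tail = n % tile_m ? n % tile_m : tile_m;
  std::printf("swap_ab: %.2f%% (n now fills the M dimension)\n",
              100.0 * tail / tile_m);  // 100.00%
}
```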
@@ -22,13 +106,17 @@ struct sm90_fp8_config_default {
   using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
   using TileShape = Shape<_128, _128, _128>;
   using ClusterShape = Shape<_2, _1, _1>;
-  using Cutlass3xGemm =
-      cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
-                      KernelSchedule, EpilogueSchedule>;
+
+  using Cutlass3xGemm = conditional_t<
+      EnableBias,
+      cutlass_3x_gemm_sm90_fp8<InType, OutType, c3x::ScaledEpilogueBias,
+                               TileShape, ClusterShape, KernelSchedule,
+                               EpilogueSchedule>,
+      cutlass_3x_gemm_sm90_fp8<InType, OutType, c3x::ScaledEpilogue, TileShape,
+                               ClusterShape, KernelSchedule, EpilogueSchedule>>;
 };
 
-template <typename InType, typename OutType,
-          template <typename, typename, typename> typename Epilogue>
+template <typename InType, typename OutType, bool EnableBias>
 struct sm90_fp8_config_M128 {
   // M in (64, 128]
   static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
@@ -37,33 +125,146 @@ struct sm90_fp8_config_M128 {
   using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
   using TileShape = Shape<_64, _128, _128>;
   using ClusterShape = Shape<_2, _1, _1>;
-  using Cutlass3xGemm =
-      cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
-                      KernelSchedule, EpilogueSchedule>;
+  using Cutlass3xGemm = conditional_t<
+      EnableBias,
+      cutlass_3x_gemm_sm90_fp8<InType, OutType, c3x::ScaledEpilogueBias,
+                               TileShape, ClusterShape, KernelSchedule,
+                               EpilogueSchedule>,
+      cutlass_3x_gemm_sm90_fp8<InType, OutType, c3x::ScaledEpilogue, TileShape,
+                               ClusterShape, KernelSchedule, EpilogueSchedule>>;
 };
 
-template <typename InType, typename OutType,
-          template <typename, typename, typename> typename Epilogue>
-struct sm90_fp8_config_M64 {
-  // M in [1, 64]
+template <typename InType, typename OutType, bool EnableBias>
+struct sm90_fp8_config_M64_N1280 {
+  // M in (16, 64], N in [1, 1280]
   static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
-  using KernelSchedule =
-      cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum;
+  using KernelSchedule = cutlass::gemm::KernelTmaWarpSpecializedFP8FastAccum;
   using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
-  using TileShape = Shape<_64, _64, _128>;
-  using ClusterShape = Shape<_1, _8, _1>;
+  using TileShape = Shape<_64, _16, _256>;
+  using ClusterShape = Shape<_1, _4, _1>;
 
-  using Cutlass3xGemm =
-      cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
-                      KernelSchedule, EpilogueSchedule>;
+  // enable swap AB for M < 64
+  using Cutlass3xGemm = conditional_t<
+      EnableBias,
+      cutlass_3x_gemm_sm90_fp8<InType, OutType, c3x::ScaledEpilogueColumnBias,
+                               TileShape, ClusterShape, KernelSchedule,
+                               EpilogueSchedule, true>,
+      cutlass_3x_gemm_sm90_fp8<InType, OutType, c3x::ScaledEpilogue, TileShape,
+                               ClusterShape, KernelSchedule, EpilogueSchedule,
+                               true>>;
 };
 
+template <typename InType, typename OutType, bool EnableBias>
+struct sm90_fp8_config_M64_N8192 {
+  // M in (16, 64], N > 1280
+  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
+  using KernelSchedule = cutlass::gemm::KernelTmaWarpSpecializedFP8FastAccum;
+  using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
+  using TileShape = Shape<_64, _64, _256>;
+  using ClusterShape = Shape<_1, _1, _1>;
+
+  // enable swap AB for M < 64
+  using Cutlass3xGemm = conditional_t<
+      EnableBias,
+      cutlass_3x_gemm_sm90_fp8<InType, OutType, c3x::ScaledEpilogueColumnBias,
+                               TileShape, ClusterShape, KernelSchedule,
+                               EpilogueSchedule, true>,
+      cutlass_3x_gemm_sm90_fp8<InType, OutType, c3x::ScaledEpilogue, TileShape,
+                               ClusterShape, KernelSchedule, EpilogueSchedule,
+                               true>>;
+};
+
+template <typename InType, typename OutType, bool EnableBias>
+struct sm90_fp8_config_M16_N1280 {
+  // M in [1, 16], N in [1, 1280]
+  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
+  using KernelSchedule = cutlass::gemm::KernelTmaWarpSpecializedFP8FastAccum;
+  using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
+  using TileShape = Shape<_64, _16, _256>;
+  using ClusterShape = Shape<_1, _2, _1>;
+
+  // enable swap AB for M < 64
+  using Cutlass3xGemm = conditional_t<
+      EnableBias,
+      cutlass_3x_gemm_sm90_fp8<InType, OutType, c3x::ScaledEpilogueColumnBias,
+                               TileShape, ClusterShape, KernelSchedule,
+                               EpilogueSchedule, true>,
+      cutlass_3x_gemm_sm90_fp8<InType, OutType, c3x::ScaledEpilogue, TileShape,
+                               ClusterShape, KernelSchedule, EpilogueSchedule,
+                               true>>;
+};
+
+template <typename InType, typename OutType, bool EnableBias>
+struct sm90_fp8_config_M16_N8192 {
+  // M in [1, 16], N > 1280
+  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
+  using KernelSchedule = cutlass::gemm::KernelTmaWarpSpecializedFP8FastAccum;
+  using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
+  using TileShape = Shape<_64, _16, _256>;
+  using ClusterShape = Shape<_1, _1, _1>;
+
+  // enable swap AB for M < 64
+  using Cutlass3xGemm = conditional_t<
+      EnableBias,
+      cutlass_3x_gemm_sm90_fp8<InType, OutType, c3x::ScaledEpilogueColumnBias,
+                               TileShape, ClusterShape, KernelSchedule,
+                               EpilogueSchedule, true>,
+      cutlass_3x_gemm_sm90_fp8<InType, OutType, c3x::ScaledEpilogue, TileShape,
+                               ClusterShape, KernelSchedule, EpilogueSchedule,
+                               true>>;
+};
+
+template <typename Gemm, typename... EpilogueArgs>
+void cutlass_gemm_caller_sm90_fp8(torch::Tensor& out, torch::Tensor const& a,
+                                  torch::Tensor const& b,
+                                  EpilogueArgs&&... epilogue_params) {
+  static constexpr bool swap_ab = Gemm::swap_ab;
+  using ElementAB = typename Gemm::ElementAB;
+  using ElementD = typename Gemm::ElementD;
+  using GemmKernel = typename Gemm::GemmKernel;
+
+  using StrideA = typename Gemm::GemmKernel::StrideA;
+  using StrideB = typename Gemm::GemmKernel::StrideB;
+  using StrideC = typename Gemm::GemmKernel::StrideC;
+
+  int32_t m = a.size(0), n = b.size(1), k = a.size(1);
+  auto prob_shape =
+      swap_ab ? cute::make_shape(n, m, k, 1) : cute::make_shape(m, n, k, 1);
+
+  StrideA a_stride =
+      cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(m, k, 1));
+  StrideB b_stride =
+      cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(n, k, 1));
+  StrideC c_stride = cutlass::make_cute_packed_stride(
+      StrideC{},
+      swap_ab ? cute::make_shape(n, m, 1) : cute::make_shape(m, n, 1));
+
+  auto a_ptr = static_cast<ElementAB*>(a.data_ptr());
+  auto b_ptr = static_cast<ElementAB*>(b.data_ptr());
+  auto c_ptr = static_cast<ElementD*>(out.data_ptr());
+
+  typename GemmKernel::MainloopArguments mainloop_args =
+      swap_ab ? typename GemmKernel::MainloopArguments{b_ptr, b_stride, a_ptr,
+                                                       a_stride}
              : typename GemmKernel::MainloopArguments{a_ptr, a_stride, b_ptr,
                                                        b_stride};
+
+  typename GemmKernel::EpilogueArguments epilogue_args{
+      Gemm::Epilogue::prepare_args(
+          std::forward<EpilogueArgs>(epilogue_params)...),
+      c_ptr, c_stride, c_ptr, c_stride};
+
+  c3x::cutlass_gemm_caller<GemmKernel>(a.device(), prob_shape, mainloop_args,
+                                       epilogue_args);
+}
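One subtlety worth flagging before the dispatch code below: the epilogue's first scale is broadcast along the GEMM's M dimension, and with `swap_ab` that dimension is the original N. Presumably for that reason the swapped branches pass `b_scales` first and `a_scales` second, while the non-swapped branches keep the usual order. A condensed sketch of the pattern (illustrative, mirroring the calls below):

```cpp
// Sketch: epilogue scale ordering under operand swapping.
// swap_ab == true  -> scales passed as (b_scales, a_scales)
// swap_ab == false -> scales passed as (a_scales, b_scales)
template <typename Gemm, typename... Args>
void call_with_scales(torch::Tensor& out, torch::Tensor const& a,
                      torch::Tensor const& b, torch::Tensor const& a_scales,
                      torch::Tensor const& b_scales, Args&&... args) {
  if constexpr (Gemm::swap_ab)
    cutlass_gemm_caller_sm90_fp8<Gemm>(out, a, b, b_scales, a_scales,
                                       std::forward<Args>(args)...);
  else
    cutlass_gemm_caller_sm90_fp8<Gemm>(out, a, b, a_scales, b_scales,
                                       std::forward<Args>(args)...);
}
```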
+
+template <typename InType, typename OutType, bool EnableBias,
+          typename... EpilogueArgs>
 inline void cutlass_gemm_sm90_fp8_dispatch(torch::Tensor& out,
                                            torch::Tensor const& a,
                                            torch::Tensor const& b,
+                                           torch::Tensor const& a_scales,
+                                           torch::Tensor const& b_scales,
                                            EpilogueArgs&&... args) {
   static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
   TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn);
@@ -71,50 +272,75 @@ inline void cutlass_gemm_sm90_fp8_dispatch(torch::Tensor& out,
 
   using Cutlass3xGemmDefault =
       typename sm90_fp8_config_default<InType, OutType,
-                                       Epilogue>::Cutlass3xGemm;
-  using Cutlass3xGemmM64 =
-      typename sm90_fp8_config_M64<InType, OutType, Epilogue>::Cutlass3xGemm;
+                                       EnableBias>::Cutlass3xGemm;
   using Cutlass3xGemmM128 =
-      typename sm90_fp8_config_M128<InType, OutType, Epilogue>::Cutlass3xGemm;
+      typename sm90_fp8_config_M128<InType, OutType, EnableBias>::Cutlass3xGemm;
+
+  using Cutlass3xGemmM64_N1280 =
+      typename sm90_fp8_config_M64_N1280<InType, OutType,
+                                         EnableBias>::Cutlass3xGemm;
+  using Cutlass3xGemmM64_N8192 =
+      typename sm90_fp8_config_M64_N8192<InType, OutType,
+                                         EnableBias>::Cutlass3xGemm;
+  using Cutlass3xGemmM16_N1280 =
+      typename sm90_fp8_config_M16_N1280<InType, OutType,
+                                         EnableBias>::Cutlass3xGemm;
+  using Cutlass3xGemmM16_N8192 =
+      typename sm90_fp8_config_M16_N8192<InType, OutType,
+                                         EnableBias>::Cutlass3xGemm;
 
   uint32_t const m = a.size(0);
-  uint32_t const mp2 =
-      std::max(static_cast<uint32_t>(64), next_pow_2(m));  // next power of 2
+  uint32_t const n = b.size(1);
 
-  if (mp2 <= 64) {
-    // m in [1, 64]
-    return cutlass_gemm_caller<Cutlass3xGemmM64>(
-        out, a, b, std::forward<EpilogueArgs>(args)...);
-  } else if (mp2 <= 128) {
+  if (m <= 16) {
+    // m in [1, 16]
+    if (n <= 1280) {
+      return cutlass_gemm_caller_sm90_fp8<Cutlass3xGemmM16_N1280>(
+          out, a, b, b_scales, a_scales, std::forward<EpilogueArgs>(args)...);
+    }
+    return cutlass_gemm_caller_sm90_fp8<Cutlass3xGemmM16_N8192>(
+        out, a, b, b_scales, a_scales, std::forward<EpilogueArgs>(args)...);
+  } else if (m <= 64) {
+    // m in (16, 64]
+    if (n <= 1280) {
+      return cutlass_gemm_caller_sm90_fp8<Cutlass3xGemmM64_N1280>(
+          out, a, b, b_scales, a_scales, std::forward<EpilogueArgs>(args)...);
+    }
+    return cutlass_gemm_caller_sm90_fp8<Cutlass3xGemmM64_N8192>(
+        out, a, b, b_scales, a_scales, std::forward<EpilogueArgs>(args)...);
+  } else if (m <= 128) {
     // m in (64, 128]
-    return cutlass_gemm_caller<Cutlass3xGemmM128>(
-        out, a, b, std::forward<EpilogueArgs>(args)...);
+    return cutlass_gemm_caller_sm90_fp8<Cutlass3xGemmM128>(
+        out, a, b, a_scales, b_scales, std::forward<EpilogueArgs>(args)...);
   } else {
     // m in (128, inf)
-    return cutlass_gemm_caller<Cutlass3xGemmDefault>(
-        out, a, b, std::forward<EpilogueArgs>(args)...);
+    return cutlass_gemm_caller_sm90_fp8<Cutlass3xGemmDefault>(
+        out, a, b, a_scales, b_scales, std::forward<EpilogueArgs>(args)...);
   }
 }
 
-template