Merge remote-tracking branch 'nm/lwilkinson/fix-flashmla-full-cudagraph' into wide_ep_working_branch

2026-05-27 05:47:54 +08:00 · 2025-07-27 21:22:09 +00:00 · 2025-07-27 21:22:09 +00:00 · f1c9ef3afd
commit f1c9ef3afd
parent ec1250421a d80a82f961
148 changed files with 7438 additions and 1598 deletions
--- a/.buildkite/nightly-benchmarks/README.md
+++ b/.buildkite/nightly-benchmarks/README.md
@ -74,7 +74,7 @@ Here is an example of one test inside `latency-tests.json`:
 In this example:
 - The `test_name` attributes is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`.
- The `parameters` attribute control the command line arguments to be used for `benchmark_latency.py`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underline to dash when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
+- The `parameters` attribute controls the command line arguments to be used for `vllm bench latency`. Please use underscores `_` instead of dashes `-` when specifying the command line arguments; `run-performance-benchmarks.sh` will convert the underscores to dashes when feeding the arguments to `vllm bench latency`. For example, the corresponding command line arguments for `vllm bench latency` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
 Note that the performance numbers are highly sensitive to the value of the parameters. Please make sure the parameters are set correctly.
@ -82,13 +82,13 @@ WARNING: The benchmarking script will save json results by itself, so please do
 ### Throughput test
-The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except for that the parameters will be fed forward to `benchmark_throughput.py`.
+The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except that the parameters are forwarded to `vllm bench throughput`.
 The number of this test is also stable -- a slight change on the value of this number might vary the performance numbers by a lot.
 ### Serving test
-We test the throughput by using `benchmark_serving.py` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example:
+We test the throughput by using `vllm bench serve` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example:
 ```json
 [
@ -118,8 +118,8 @@ Inside this example:
 - The `test_name` attribute is also a unique identifier for the test. It must start with `serving_`.
 - The `server-parameters` includes the command line arguments for vLLM server.
- The `client-parameters` includes the command line arguments for `benchmark_serving.py`.
+- The `client-parameters` includes the command line arguments for `vllm bench serve`.
- The `qps_list` controls the list of qps for test. It will be used to configure the `--request-rate` parameter in `benchmark_serving.py`
+- The `qps_list` controls the list of qps for test. It will be used to configure the `--request-rate` parameter in `vllm bench serve`
 The number of this test is less stable compared to the delay and latency benchmarks (due to randomized sharegpt dataset sampling inside `benchmark_serving.py`), but a large change on this number (e.g. 5% change) still vary the output greatly.
--- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
@ -100,7 +100,7 @@ if __name__ == "__main__":
            raw_result = json.loads(f.read())
        if "serving" in str(test_file):
-            # this result is generated via `benchmark_serving.py`
+            # this result is generated via `vllm bench serve` command
            # attach the benchmarking command to raw_result
            try:
@ -120,7 +120,7 @@ if __name__ == "__main__":
            continue
        elif "latency" in f.name:
-            # this result is generated via `benchmark_latency.py`
+            # this result is generated via `vllm bench latency` command
            # attach the benchmarking command to raw_result
            try:
@ -148,7 +148,7 @@ if __name__ == "__main__":
            continue
        elif "throughput" in f.name:
-            # this result is generated via `benchmark_throughput.py`
+            # this result is generated via `vllm bench throughput` command
            # attach the benchmarking command to raw_result
            try:
--- a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
@ -73,7 +73,7 @@ get_current_llm_serving_engine() {
    echo "Container: vllm"
    # move to a completely irrelevant directory, to avoid import vllm from current folder
    export CURRENT_LLM_SERVING_ENGINE=vllm
-    
+
    return
  fi
 }
@ -95,12 +95,14 @@ json2args() {
 }
 kill_gpu_processes() {
-  pkill -f python
+  pkill -f '[p]ython'
-  pkill -f python3
+  pkill -f '[p]ython3'
-  pkill -f tritonserver
+  pkill -f '[t]ritonserver'
-  pkill -f pt_main_thread
+  pkill -f '[p]t_main_thread'
-  pkill -f text-generation
+  pkill -f '[t]ext-generation'
-  pkill -f lmdeploy
+  pkill -f '[l]mdeploy'
  # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
  pkill -f '[V]LLM'
  while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
    sleep 1
@ -125,7 +127,7 @@ ensure_installed() {
 }
 run_serving_tests() {
-  # run serving tests using `benchmark_serving.py`
+  # run serving tests using `vllm bench serve` command
  # $1: a json file specifying serving test cases
  local serving_test_file
@ -225,7 +227,7 @@ run_serving_tests() {
      if [[ "$dataset_name" = "sharegpt" ]]; then
-        client_command="python3 benchmark_serving.py \
+        client_command="vllm bench serve \
          --backend $backend \
          --tokenizer /tokenizer_cache \
          --model $model \
@ -246,7 +248,7 @@ run_serving_tests() {
        sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len')
        sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len')
-        client_command="python3 benchmark_serving.py \
+        client_command="vllm bench serve \
          --backend $backend \
          --tokenizer /tokenizer_cache \
          --model $model \
@ -265,13 +267,13 @@ run_serving_tests() {
          $client_args"
      else
-  
+
        echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name."
        exit 1
      fi
-        
+
      echo "Running test case $test_name with qps $qps"
      echo "Client command: $client_command"
@ -302,7 +304,7 @@ run_serving_tests() {
 }
 run_genai_perf_tests() {
-  # run genai-perf tests 
+  # run genai-perf tests
  # $1: a json file specifying genai-perf test cases
  local genai_perf_test_file
@ -311,14 +313,14 @@ run_genai_perf_tests() {
  # Iterate over genai-perf tests
  jq -c '.[]' "$genai_perf_test_file" | while read -r params; do
    # get the test name, and append the GPU type back to it.
-    test_name=$(echo "$params" | jq -r '.test_name')    
+    test_name=$(echo "$params" | jq -r '.test_name')
-    
+
    # if TEST_SELECTOR is set, only run the test cases that match the selector
    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
      echo "Skip test case $test_name."
      continue
    fi
-    
+
    # prepend the current serving engine to the test name
    test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
@ -369,10 +371,10 @@ run_genai_perf_tests() {
        qps=$num_prompts
        echo "now qps is $qps"
      fi
-    
+
      new_test_name=$test_name"_qps_"$qps
      backend=$CURRENT_LLM_SERVING_ENGINE
-      
+
      if [[ "$backend" == *"vllm"* ]]; then
        backend="vllm"
      fi
@ -413,7 +415,7 @@ prepare_dataset() {
  do
    cat sonnet.txt >> sonnet_4x.txt
  done
-  
+
 }
 main() {
--- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
@ -126,7 +126,8 @@ kill_gpu_processes() {
  ps -aux
  lsof -t -i:8000 | xargs -r kill -9
  pgrep python3 | xargs -r kill -9
-
+  # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
  pgrep VLLM | xargs -r kill -9
  # wait until GPU memory usage smaller than 1GB
  if command -v nvidia-smi; then
@ -164,7 +165,7 @@ upload_to_buildkite() {
 }
 run_latency_tests() {
-  # run latency tests using `benchmark_latency.py`
+  # run latency tests using `vllm bench latency` command
  # $1: a json file specifying latency test cases
  local latency_test_file
@ -205,7 +206,7 @@ run_latency_tests() {
      fi
    fi
-    latency_command=" $latency_envs python3 benchmark_latency.py \
+    latency_command=" $latency_envs vllm bench latency \
      --output-json $RESULTS_FOLDER/${test_name}.json \
      $latency_args"
@ -231,7 +232,7 @@ run_latency_tests() {
 }
 run_throughput_tests() {
-  # run throughput tests using `benchmark_throughput.py`
+  # run throughput tests using `vllm bench throughput`
  # $1: a json file specifying throughput test cases
  local throughput_test_file
@ -272,7 +273,7 @@ run_throughput_tests() {
      fi
    fi
-    throughput_command=" $throughput_envs python3 benchmark_throughput.py \
+    throughput_command=" $throughput_envs vllm bench throughput \
      --output-json $RESULTS_FOLDER/${test_name}.json \
      $throughput_args"
@ -297,7 +298,7 @@ run_throughput_tests() {
 }
 run_serving_tests() {
-  # run serving tests using `benchmark_serving.py`
+  # run serving tests using `vllm bench serve` command
  # $1: a json file specifying serving test cases
  local serving_test_file
@ -393,7 +394,7 @@ run_serving_tests() {
      # pass the tensor parallel size to the client so that it can be displayed
      # on the benchmark dashboard
-      client_command="python3 benchmark_serving.py \
+      client_command="vllm bench serve \
        --save-result \
        --result-dir $RESULTS_FOLDER \
        --result-filename ${new_test_name}.json \
@ -447,7 +448,7 @@ main() {
  (which jq) || (apt-get update && apt-get -y install jq)
  (which lsof) || (apt-get update && apt-get install -y lsof)
-  # get the current IP address, required by benchmark_serving.py
+  # get the current IP address, required by `vllm bench serve` command
  export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
  # turn of the reporting of the status of each request, to clean up the terminal output
  export VLLM_LOGGING_LEVEL="WARNING"
--- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
@ -13,9 +13,9 @@ NUMA_NODE=${NUMA_NODE:-1}
 export CMAKE_BUILD_PARALLEL_LEVEL=32
 # Setup cleanup
-remove_docker_container() { 
+remove_docker_container() {
-    set -e; 
+    set -e;
-    docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true; 
+    docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true;
 }
 trap remove_docker_container EXIT
 remove_docker_container
@ -69,7 +69,7 @@ function cpu_tests() {
  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
    pytest -s -v \
-    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]" 
+    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"
  # Note: disable it until supports V1
  # Run AWQ test
@ -83,7 +83,7 @@ function cpu_tests() {
    set -e
    VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
    timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
-    python3 benchmarks/benchmark_serving.py \
+    vllm bench serve \
      --backend vllm \
      --dataset-name random \
      --model meta-llama/Llama-3.2-3B-Instruct \
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
@ -0,0 +1,166 @@
 #!/bin/bash
 set -xu
 # Force-remove the containers named tpu-test and vllm-tpu.
 # "|| true" makes each removal best-effort: a failing "docker rm" (for
 # example because the container does not exist) is ignored.
 # Invoked both from the EXIT trap and once directly before the image build.
 remove_docker_container() { 
    docker rm -f tpu-test || true; 
    docker rm -f vllm-tpu || true;
 }
 trap remove_docker_container EXIT
 # Remove the container that might not be cleaned up in the previous run.
 remove_docker_container
 # Build the docker image.
 docker build -f docker/Dockerfile.tpu -t vllm-tpu .
 # Set up cleanup.
 # Reclaim Docker disk space when the filesystem that holds Docker's root
 # directory is more than 70% full; otherwise only report that no cleanup
 # is needed.
 # Exits the whole script with status 1 if the Docker root directory cannot
 # be determined.
 cleanup_docker() {
  # Get Docker's root directory
  docker_root=$(docker info -f '{{.DockerRootDir}}')
  if [ -z "$docker_root" ]; then
    echo "Failed to determine Docker root directory."
    exit 1
  fi
  echo "Docker root directory: $docker_root"
  # Check disk usage of the filesystem where Docker's root directory is located
  # Takes the 5th field of the last line of the df output and strips the "%"
  # sign (presumably the Use% column -- depends on the local df output format).
  disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
  # Define the threshold
  threshold=70
  # Strictly greater-than: usage of exactly $threshold does not trigger cleanup.
  if [ "$disk_usage" -gt "$threshold" ]; then
    echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
    # Remove dangling images (those that are not tagged and not used by any container)
    docker image prune -f
    # Remove unused volumes / force the system prune for old images as well.
    # The system prune only runs if the volume prune succeeded (&&).
    docker volume prune -f && docker system prune --force --filter "until=72h" --all
    echo "Docker images and volumes cleanup completed."
  else
    echo "Disk usage is below $threshold%. No cleanup needed."
  fi
 }
 cleanup_docker
 # For HF_TOKEN.
 source /etc/environment
 docker run --privileged --net host --shm-size=16G -it \
    -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
    vllm-tpu /bin/bash -c '
 set -e # Exit immediately if a command exits with a non-zero status.
 set -u # Treat unset variables as an error.
 echo "--- Starting script inside Docker container ---"
 # Create results directory
 RESULTS_DIR=$(mktemp -d)
 # If mktemp fails, set -e will cause the script to exit.
 echo "Results will be stored in: $RESULTS_DIR"
 # Install dependencies
 echo "--- Installing Python dependencies ---"
 python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
    && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
    && python3 -m pip install --progress-bar off lm_eval[api]==0.4.4 \
    && python3 -m pip install --progress-bar off hf-transfer
 echo "--- Python dependencies installed ---"
 export VLLM_USE_V1=1
 export VLLM_XLA_CHECK_RECOMPILATION=1
 export VLLM_XLA_CACHE_PATH=
 echo "Using VLLM V1"
 echo "--- Hardware Information ---"
 # tpu-info
 echo "--- Starting Tests ---"
 set +e
 overall_script_exit_code=0
 # --- Test Definitions ---
 # If a test fails, this function will print logs and will not cause the main script to exit.
 # Arguments:
 #   $1 - test number, used in log messages and in the per-test log file name
 #   $2 - human-readable test name, used in log messages
 #   $3 - command string, executed with eval
 # Returns the exit code of the executed command (0 when the test passed).
 # NOTE: this function lives inside a single-quoted bash -c script, so
 # comments here must not contain single-quote characters.
 run_test() {
    local test_num=$1
    local test_name=$2
    local test_command=$3
    local log_file="$RESULTS_DIR/test_${test_num}.log"
    local actual_exit_code
    echo "--- TEST_$test_num: Running $test_name ---"
    # Execute the test command.
    # stdout and stderr are each piped through tee, so both are appended to
    # the per-test log file while still appearing on their original streams.
    eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2)
    # Must be read immediately after eval, before another command overwrites $?.
    actual_exit_code=$?
    echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" # This goes to main log
    echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file" # Also to per-test log
    if [ "$actual_exit_code" -ne 0 ]; then
        # Failure path: replay the captured per-test log on stderr.
        echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2
        echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2
        if [ -f "$log_file" ]; then
            cat "$log_file" >&2
        else
            echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2
        fi
        echo "--- End of log for TEST_$test_num ($test_name) ---" >&2
        return "$actual_exit_code" # Return the failure code
    else
        echo "TEST_$test_num ($test_name) PASSED."
        return 0 # Return success
    fi
 }
 # Helper function to call run_test and update the overall script exit code
 # Arguments ($1, $2, $3) are forwarded unchanged to run_test as the test
 # number, test name and command string.
 # Side effect: sets the global overall_script_exit_code to 1 when the test
 # fails; the variable is never reset to 0 here, so one failure is enough to
 # mark the whole run as failed.
 run_and_track_test() {
    local test_num_arg="$1"
    local test_name_arg="$2"
    local test_command_arg="$3"
    # Run the test
    run_test "$test_num_arg" "$test_name_arg" "$test_command_arg"
    # $? is expanded before "local" runs, so this holds the status of run_test.
    local test_specific_exit_code=$?
    # If the test failed, set the overall script exit code to 1
    if [ "$test_specific_exit_code" -ne 0 ]; then
        # No need for extra echo here, run_test already logged the failure.
        overall_script_exit_code=1
    fi
 }
 # --- Actual Test Execution ---
 run_and_track_test 1 "test_struct_output_generate.py" \
    "HF_HUB_DISABLE_XET=1 python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
 run_and_track_test 2 "test_moe_pallas.py" \
    "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
 run_and_track_test 3 "test_lora.py" \
    "VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
 run_and_track_test 4 "test_tpu_qkv_linear.py" \
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
 run_and_track_test 5 "test_spmd_model_weight_loading.py" \
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
 run_and_track_test 6 "test_kv_cache_update_kernel.py" \
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py"
 # After all tests have been attempted, exit with the overall status.
 if [ "$overall_script_exit_code" -ne 0 ]; then
    echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---"
 else
    echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---"
 fi
 exit "$overall_script_exit_code"
 ' # IMPORTANT: This is the closing single quote for the bash -c "..." command. Ensure it is present and correct.
 # Capture the exit code of the docker run command
 DOCKER_RUN_EXIT_CODE=$?
 # The trap will run for cleanup.
 # Exit the main script with the Docker run command's exit code.
 if [ "$DOCKER_RUN_EXIT_CODE" -ne 0 ]; then
    echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE."
    exit "$DOCKER_RUN_EXIT_CODE"
 else
    echo "Docker run command completed successfully."
    exit 0
 fi
 # TODO: This test fails because it uses RANDOM_SEED sampling
 # pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
@ -150,18 +150,6 @@ run_and_track_test 9 "test_multimodal.py" \
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py"
 run_and_track_test 10 "test_pallas.py" \
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py"
 run_and_track_test 11 "test_struct_output_generate.py" \
    "HF_HUB_DISABLE_XET=1 python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
 run_and_track_test 12 "test_moe_pallas.py" \
    "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
 run_and_track_test 13 "test_lora.py" \
    "VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
 run_and_track_test 14 "test_tpu_qkv_linear.py" \
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
 run_and_track_test 15 "test_spmd_model_weight_loading.py" \
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
 run_and_track_test 16 "test_kv_cache_update_kernel.py" \
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py"
 # After all tests have been attempted, exit with the overall status.
 if [ "$overall_script_exit_code" -ne 0 ]; then
--- a/.buildkite/scripts/run-benchmarks.sh
+++ b/.buildkite/scripts/run-benchmarks.sh
@ -11,10 +11,10 @@ cd "$(dirname "${BASH_SOURCE[0]}")/../.."
 (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
 # run python-based benchmarks and upload the result to buildkite
-python3 benchmarks/benchmark_latency.py --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
+vllm bench latency --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
 bench_latency_exit_code=$?
-python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
+vllm bench throughput --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
 bench_throughput_exit_code=$?
 # run server-based benchmarks and upload the result to buildkite
@ -24,7 +24,7 @@ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/r
 # wait for server to start, timeout after 600 seconds
 timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
-python3 benchmarks/benchmark_serving.py \
+vllm bench serve \
    --backend vllm \
    --dataset-name sharegpt \
    --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \
--- a/.buildkite/scripts/tpu/run_bm.sh
+++ b/.buildkite/scripts/tpu/run_bm.sh
@ -77,7 +77,7 @@ done
 echo "run benchmark test..."
 echo "logging to $BM_LOG"
 echo
-python benchmarks/benchmark_serving.py \
+vllm bench serve \
    --backend vllm \
    --model $MODEL  \
    --dataset-name sonnet \
--- a/.github/workflows/lint-and-deploy.yaml
+++ b/.github/workflows/lint-and-deploy.yaml
@ -7,7 +7,7 @@ permissions:
 jobs:
  lint-and-deploy:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04-arm
    steps:
      - name: Checkout
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@ -98,7 +98,7 @@ Then run the benchmarking script
 ```bash
 # download dataset
 # wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-python3 vllm/benchmarks/benchmark_serving.py \
+vllm bench serve \
  --backend vllm \
  --model NousResearch/Hermes-3-Llama-3.1-8B \
  --endpoint /v1/completions \
@ -111,25 +111,25 @@ If successful, you will see the following output
 ```
 ============ Serving Benchmark Result ============
-Successful requests:                     10        
+Successful requests:                     10
-Benchmark duration (s):                  5.78      
+Benchmark duration (s):                  5.78
-Total input tokens:                      1369      
+Total input tokens:                      1369
-Total generated tokens:                  2212      
+Total generated tokens:                  2212
-Request throughput (req/s):              1.73      
+Request throughput (req/s):              1.73
-Output token throughput (tok/s):         382.89    
+Output token throughput (tok/s):         382.89
-Total Token throughput (tok/s):          619.85    
+Total Token throughput (tok/s):          619.85
 ---------------Time to First Token----------------
-Mean TTFT (ms):                          71.54     
+Mean TTFT (ms):                          71.54
-Median TTFT (ms):                        73.88     
+Median TTFT (ms):                        73.88
-P99 TTFT (ms):                           79.49     
+P99 TTFT (ms):                           79.49
 -----Time per Output Token (excl. 1st token)------
-Mean TPOT (ms):                          7.91      
+Mean TPOT (ms):                          7.91
-Median TPOT (ms):                        7.96      
+Median TPOT (ms):                        7.96
-P99 TPOT (ms):                           8.03      
+P99 TPOT (ms):                           8.03
 ---------------Inter-token Latency----------------
-Mean ITL (ms):                           7.74      
+Mean ITL (ms):                           7.74
-Median ITL (ms):                         7.70      
+Median ITL (ms):                         7.70
-P99 ITL (ms):                            8.39      
+P99 ITL (ms):                            8.39
 ==================================================
 ```
@ -141,7 +141,7 @@ If the dataset you want to benchmark is not supported yet in vLLM, even then you
 {"prompt": "What is the capital of India?"}
 {"prompt": "What is the capital of Iran?"}
 {"prompt": "What is the capital of China?"}
-``` 
+```
 ```bash
 # start server
@ -150,7 +150,7 @@ VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct --disable-log-requests
 ```bash
 # run benchmarking script
-python3 benchmarks/benchmark_serving.py --port 9001 --save-result --save-detailed \
+vllm bench serve --port 9001 --save-result --save-detailed \
  --backend vllm \
  --model meta-llama/Llama-3.1-8B-Instruct \
  --endpoint /v1/completions \
@ -174,7 +174,7 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
 ```
 ```bash
-python3 vllm/benchmarks/benchmark_serving.py \
+vllm bench serve \
  --backend openai-chat \
  --model Qwen/Qwen2-VL-7B-Instruct \
  --endpoint /v1/chat/completions \
@ -194,7 +194,7 @@ VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
 ```
 ``` bash
-python3 benchmarks/benchmark_serving.py \
+vllm bench serve \
    --model meta-llama/Meta-Llama-3-8B-Instruct \
    --dataset-name hf \
    --dataset-path likaixin/InstructCoder \
@ -210,7 +210,7 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
 **`lmms-lab/LLaVA-OneVision-Data`**
 ```bash
-python3 vllm/benchmarks/benchmark_serving.py \
+vllm bench serve \
  --backend openai-chat \
  --model Qwen/Qwen2-VL-7B-Instruct \
  --endpoint /v1/chat/completions \
@ -224,7 +224,7 @@ python3 vllm/benchmarks/benchmark_serving.py \
 **`Aeala/ShareGPT_Vicuna_unfiltered`**
 ```bash
-python3 vllm/benchmarks/benchmark_serving.py \
+vllm bench serve \
  --backend openai-chat \
  --model Qwen/Qwen2-VL-7B-Instruct \
  --endpoint /v1/chat/completions \
@ -237,7 +237,7 @@ python3 vllm/benchmarks/benchmark_serving.py \
 **`AI-MO/aimo-validation-aime`**
 ``` bash
-python3 vllm/benchmarks/benchmark_serving.py \
+vllm bench serve \
    --model Qwen/QwQ-32B \
    --dataset-name hf \
    --dataset-path AI-MO/aimo-validation-aime \
@ -248,7 +248,7 @@ python3 vllm/benchmarks/benchmark_serving.py \
 **`philschmid/mt-bench`**
 ``` bash
-python3 vllm/benchmarks/benchmark_serving.py \
+vllm bench serve \
    --model Qwen/QwQ-32B \
    --dataset-name hf \
    --dataset-path philschmid/mt-bench \
@ -261,7 +261,7 @@ When using OpenAI-compatible backends such as `vllm`, optional sampling
 parameters can be specified. Example client command:
 ```bash
-python3 vllm/benchmarks/benchmark_serving.py \
+vllm bench serve \
  --backend vllm \
  --model NousResearch/Hermes-3-Llama-3.1-8B \
  --endpoint /v1/completions \
@ -296,7 +296,7 @@ The following arguments can be used to control the ramp-up:
 <br/>
 ```bash
-python3 vllm/benchmarks/benchmark_throughput.py \
+vllm bench throughput \
  --model NousResearch/Hermes-3-Llama-3.1-8B \
  --dataset-name sonnet \
  --dataset-path vllm/benchmarks/sonnet.txt \
@ -314,7 +314,7 @@ Total num output tokens:  1500
 **VisionArena Benchmark for Vision Language Models**
 ``` bash
-python3 vllm/benchmarks/benchmark_throughput.py \
+vllm bench throughput \
  --model Qwen/Qwen2-VL-7B-Instruct \
  --backend vllm-chat \
  --dataset-name hf \
@ -336,7 +336,7 @@ Total num output tokens:  1280
 ``` bash
 VLLM_WORKER_MULTIPROC_METHOD=spawn \
 VLLM_USE_V1=1 \
-python3 vllm/benchmarks/benchmark_throughput.py \
+vllm bench throughput \
    --dataset-name=hf \
    --dataset-path=likaixin/InstructCoder \
    --model=meta-llama/Meta-Llama-3-8B-Instruct \
@ -360,7 +360,7 @@ Total num output tokens:  204800
 **`lmms-lab/LLaVA-OneVision-Data`**
 ```bash
-python3 vllm/benchmarks/benchmark_throughput.py \
+vllm bench throughput \
  --model Qwen/Qwen2-VL-7B-Instruct \
  --backend vllm-chat \
  --dataset-name hf \
@ -373,7 +373,7 @@ python3 vllm/benchmarks/benchmark_throughput.py \
 **`Aeala/ShareGPT_Vicuna_unfiltered`**
 ```bash
-python3 vllm/benchmarks/benchmark_throughput.py \
+vllm bench throughput \
  --model Qwen/Qwen2-VL-7B-Instruct \
  --backend vllm-chat \
  --dataset-name hf \
@ -385,7 +385,7 @@ python3 vllm/benchmarks/benchmark_throughput.py \
 **`AI-MO/aimo-validation-aime`**
 ```bash
-python3 benchmarks/benchmark_throughput.py \
+vllm bench throughput \
  --model Qwen/QwQ-32B \
  --backend vllm \
  --dataset-name hf \
@ -399,7 +399,7 @@ python3 benchmarks/benchmark_throughput.py \
 ``` bash
 # download dataset
 # wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-python3 vllm/benchmarks/benchmark_throughput.py \
+vllm bench throughput \
  --model meta-llama/Llama-2-7b-hf \
  --backend vllm \
  --dataset_path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
--- a/benchmarks/auto_tune/README.md
+++ b/benchmarks/auto_tune/README.md
@ -105,7 +105,7 @@ After the script finishes, you will find the results in a new, timestamped direc
 - **Log Files**: The directory (`$BASE/auto-benchmark/YYYY_MM_DD_HH_MM/`) contains detailed logs for each run:
    - `vllm_log_...txt`: The log output from the vLLM server for each parameter combination.
-    - `bm_log_...txt`: The log output from the `benchmark_serving.py` script for each benchmark run.
+    - `bm_log_...txt`: The log output from the `vllm bench serve` command for each benchmark run.
 - **Final Result Summary**: A file named `result.txt` is created in the log directory. It contains a summary of each tested combination and concludes with the overall best parameters found.
--- a/benchmarks/auto_tune/auto_tune.sh
+++ b/benchmarks/auto_tune/auto_tune.sh
@ -1,6 +1,6 @@
 #!/bin/bash
-# This script aims to tune the best server parameter combinations to maximize throughput for given requirement. 
+# This script aims to tune the best server parameter combinations to maximize throughput for given requirement.
 # See details in README (benchmarks/auto_tune/README.md).
 TAG=$(date +"%Y_%m_%d_%H_%M")
@ -56,7 +56,7 @@ start_server() {
    local max_num_batched_tokens=$3
    local vllm_log=$4
    local profile_dir=$5
-    
+
    pkill -f vllm
    VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir vllm serve $MODEL \
@ -73,9 +73,9 @@ start_server() {
    # wait for 10 minutes...
    server_started=0
-    for i in {1..60}; do  
+    for i in {1..60}; do
        RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
-        STATUS_CODE=$(echo "$RESPONSE" | tail -n 1) 
+        STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
        if [[ "$STATUS_CODE" -eq 200 ]]; then
            server_started=1
            break
@ -98,10 +98,10 @@ update_best_profile() {
    selected_profile_file=
    if [[ "$SYSTEM" == "TPU" ]]; then
        selected_profile_file="${sorted_paths[$profile_index]}/*.xplane.pb"
-    fi 
+    fi
    if [[ "$SYSTEM" == "GPU" ]]; then
        selected_profile_file="${sorted_paths[$profile_index]}"
-    fi 
+    fi
    rm -f $PROFILE_PATH/*
    cp $selected_profile_file $PROFILE_PATH
 }
@ -129,14 +129,14 @@ run_benchmark() {
        echo "server started."
    fi
    echo
-    
+
    echo "run benchmark test..."
    meet_latency_requirement=0
    # get a basic qps by using request-rate inf
    bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt"
    prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 ))
 adjusted_input_len=$(( INPUT_LEN - prefix_len ))
-    python3 benchmarks/benchmark_serving.py \
+    vllm bench serve \
        --backend vllm \
        --model $MODEL  \
        --dataset-name random \
@ -169,7 +169,7 @@ adjusted_input_len=$(( INPUT_LEN - prefix_len ))
            curl -X POST http://0.0.0.0:8004/reset_prefix_cache
            sleep 5
            bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt"
-            python3 benchmarks/benchmark_serving.py \
+            vllm bench serve \
                --backend vllm \
                --model $MODEL  \
                --dataset-name random \
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@ -11,6 +11,7 @@ from typing import Any, Optional
 import numpy as np
 from tqdm import tqdm
 from typing_extensions import deprecated
 import vllm.envs as envs
 from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
@ -34,6 +35,10 @@ def save_to_pytorch_benchmark_format(
        write_to_json(pt_file, pt_records)
@deprecated(
    "benchmark_latency.py is deprecated and will be removed in a "
    "future version. Please use 'vllm bench latency' instead.",
 )
 def main(args: argparse.Namespace):
    print(args)
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@ -38,6 +38,7 @@ from typing import Any, Literal, Optional
 import numpy as np
 from tqdm.asyncio import tqdm
 from transformers import PreTrainedTokenizerBase
 from typing_extensions import deprecated
 from backend_request_func import (
    ASYNC_REQUEST_FUNCS,
@ -593,6 +594,10 @@ def save_to_pytorch_benchmark_format(
        write_to_json(pt_file, pt_records)
@deprecated(
    "benchmark_serving.py is deprecated and will be removed in a future "
    "version. Please use 'vllm bench serve' instead.",
 )
 def main(args: argparse.Namespace):
    print(args)
    random.seed(args.seed)
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@ -15,6 +15,7 @@ import torch
 import uvloop
 from tqdm import tqdm
 from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase
 from typing_extensions import deprecated
 from benchmark_dataset import (
    AIMODataset,
@ -382,6 +383,10 @@ def get_requests(args, tokenizer):
    return dataset_cls(**common_kwargs).sample(**sample_kwargs)
@deprecated(
    "benchmark_throughput.py is deprecated and will be removed in a "
    "future version. Please use 'vllm bench throughput' instead.",
 )
 def main(args: argparse.Namespace):
    if args.seed is None:
        args.seed = 0
--- a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
+++ b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
@ -3,7 +3,7 @@
 # benchmark the overhead of disaggregated prefill.
 # methodology:
 # - send all requests to the prefill vLLM instance. It will buffer the KV cache.
-# - then send all request to decode instance. 
+# - then send all requests to the decode instance.
 # - The TTFT of decode instance is the overhead.
 set -ex
@ -12,6 +12,8 @@ kill_gpu_processes() {
  # kill all processes on GPU.
  pgrep pt_main_thread | xargs -r kill -9
  pgrep python3 | xargs -r kill -9
  # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
  pgrep VLLM | xargs -r kill -9
  sleep 10
  # remove vllm config file
@ -61,7 +63,7 @@ benchmark() {
    --gpu-memory-utilization 0.6 \
    --kv-transfer-config \
    '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
-    
+
  CUDA_VISIBLE_DEVICES=1 python3 \
    -m vllm.entrypoints.openai.api_server \
@ -76,38 +78,38 @@ benchmark() {
  wait_for_server 8200
  # let the prefill instance finish prefill
-  python3 ../benchmark_serving.py \
+  vllm bench serve \
-          --backend vllm \
+    --backend vllm \
-          --model $model \
+    --model $model \
-          --dataset-name $dataset_name \
+    --dataset-name $dataset_name \
-          --dataset-path $dataset_path \
+    --dataset-path $dataset_path \
-          --sonnet-input-len $input_len \
+    --sonnet-input-len $input_len \
-          --sonnet-output-len "$output_len" \
+    --sonnet-output-len "$output_len" \
-          --sonnet-prefix-len $prefix_len \
+    --sonnet-prefix-len $prefix_len \
-          --num-prompts $num_prompts \
+    --num-prompts $num_prompts \
-          --port 8100 \
+    --port 8100 \
-          --save-result \
+    --save-result \
-          --result-dir $results_folder \
+    --result-dir $results_folder \
-          --result-filename disagg_prefill_tp1.json \
+    --result-filename disagg_prefill_tp1.json \
-          --request-rate "inf"
+    --request-rate "inf"
  # send the request to decode.
  # The TTFT of this command will be the overhead of disagg prefill impl.
-  python3 ../benchmark_serving.py \
+  vllm bench serve \
-          --backend vllm \
+    --backend vllm \
-          --model $model \
+    --model $model \
-          --dataset-name $dataset_name \
+    --dataset-name $dataset_name \
-          --dataset-path $dataset_path \
+    --dataset-path $dataset_path \
-          --sonnet-input-len $input_len \
+    --sonnet-input-len $input_len \
-          --sonnet-output-len "$output_len" \
+    --sonnet-output-len "$output_len" \
-          --sonnet-prefix-len $prefix_len \
+    --sonnet-prefix-len $prefix_len \
-          --num-prompts $num_prompts \
+    --num-prompts $num_prompts \
-          --port 8200 \
+    --port 8200 \
-          --save-result \
+    --save-result \
-          --result-dir $results_folder \
+    --result-dir $results_folder \
-          --result-filename disagg_prefill_tp1_overhead.json \
+    --result-filename disagg_prefill_tp1_overhead.json \
-          --request-rate "$qps"
+    --request-rate "$qps"
  kill_gpu_processes
 }
--- a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
+++ b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
@ -18,6 +18,8 @@ kill_gpu_processes() {
  # kill all processes on GPU.
  pgrep pt_main_thread | xargs -r kill -9
  pgrep python3 | xargs -r kill -9
  # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
  pgrep VLLM | xargs -r kill -9
  for port in 8000 8100 8200; do lsof -t -i:$port | xargs -r kill -9; done
  sleep 1
 }
@ -58,7 +60,7 @@ launch_chunked_prefill() {
 launch_disagg_prefill() {
-  model="meta-llama/Meta-Llama-3.1-8B-Instruct" 
+  model="meta-llama/Meta-Llama-3.1-8B-Instruct"
  # disagg prefill
  CUDA_VISIBLE_DEVICES=0 python3 \
    -m vllm.entrypoints.openai.api_server \
@ -97,20 +99,20 @@ benchmark() {
  output_len=$2
  tag=$3
-  python3 ../benchmark_serving.py \
+  vllm bench serve \
-          --backend vllm \
+    --backend vllm \
-          --model $model \
+    --model $model \
-          --dataset-name $dataset_name \
+    --dataset-name $dataset_name \
-          --dataset-path $dataset_path \
+    --dataset-path $dataset_path \
-          --sonnet-input-len $input_len \
+    --sonnet-input-len $input_len \
-          --sonnet-output-len "$output_len" \
+    --sonnet-output-len "$output_len" \
-          --sonnet-prefix-len $prefix_len \
+    --sonnet-prefix-len $prefix_len \
-          --num-prompts $num_prompts \
+    --num-prompts $num_prompts \
-          --port 8000 \
+    --port 8000 \
-          --save-result \
+    --save-result \
-          --result-dir $results_folder \
+    --result-dir $results_folder \
-          --result-filename "$tag"-qps-"$qps".json \
+    --result-filename "$tag"-qps-"$qps".json \
-          --request-rate "$qps"
+    --request-rate "$qps"
  sleep 2
 }
--- a/benchmarks/kernels/benchmark_moe_align_block_size.py
+++ b/benchmarks/kernels/benchmark_moe_align_block_size.py
@ -5,9 +5,8 @@ import itertools
 import torch
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.fused_moe.moe_align_block_size import (
-    moe_align_block_size_triton,
+    moe_align_block_size,
 )
 from vllm.triton_utils import triton
@ -21,60 +20,6 @@ def get_topk_ids(num_tokens: int, num_experts: int, topk: int) -> torch.Tensor:
    )
 def check_correctness(num_tokens, num_experts=256, block_size=256, topk=8):
    """
    Compare the vLLM custom op `ops.moe_align_block_size` against
    `moe_align_block_size_triton` on the same top-k expert ids.

    Top-k ids are produced by `get_topk_ids(num_tokens, num_experts, topk)`.
    Each implementation writes into its own set of CUDA int32 output buffers.
    The outcome is only printed to stdout: nothing is returned, and a
    mismatch does not raise.
    """
    topk_ids = get_topk_ids(num_tokens, num_experts, topk)
    # 1. malloc space for triton and vllm
    # malloc enough space (max_num_tokens_padded) for the sorted ids
    max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
    sorted_ids_triton = torch.empty(
        (max_num_tokens_padded,), dtype=torch.int32, device="cuda"
    )
    expert_ids_triton = torch.empty(
        (max_num_tokens_padded // block_size,), dtype=torch.int32, device="cuda"
    )
    num_tokens_post_pad_triton = torch.empty((1,), dtype=torch.int32, device="cuda")
    # The vLLM buffers mirror the Triton ones in shape, dtype and device.
    sorted_ids_vllm = torch.empty_like(sorted_ids_triton)
    expert_ids_vllm = torch.empty_like(expert_ids_triton)
    num_tokens_post_pad_vllm = torch.empty_like(num_tokens_post_pad_triton)
    # 2. run implementations
    moe_align_block_size_triton(
        topk_ids,
        num_experts,
        block_size,
        sorted_ids_triton,
        expert_ids_triton,
        num_tokens_post_pad_triton,
    )
    ops.moe_align_block_size(
        topk_ids,
        num_experts,
        block_size,
        sorted_ids_vllm,
        expert_ids_vllm,
        num_tokens_post_pad_vllm,
    )
    # Printed unconditionally once both calls have returned, i.e. before any
    # comparison of the outputs has taken place.
    print(f"✅ VLLM implementation works with {num_experts} experts!")
    # 3. compare results
    # Only expert_ids and num_tokens_post_pad are compared; the sorted_ids_*
    # buffers are never checked.
    # NOTE(review): the buffers come from torch.empty, so any expert_ids entry
    # that a kernel leaves unwritten would be compared uninitialized; confirm
    # that both implementations fill the whole buffer.
    if torch.allclose(expert_ids_triton, expert_ids_vllm) and torch.allclose(
        num_tokens_post_pad_triton, num_tokens_post_pad_vllm
    ):
        print("✅ Triton and VLLM implementations match.")
    else:
        # On mismatch, dump both sets of outputs for manual inspection.
        print("❌ Triton and VLLM implementations DO NOT match.")
        print("Triton expert_ids:", expert_ids_triton)
        print("VLLM expert_ids:", expert_ids_vllm)
        print("Triton num_tokens_post_pad:", num_tokens_post_pad_triton)
        print("VLLM num_tokens_post_pad:", num_tokens_post_pad_vllm)
 # test configurations
 num_tokens_range = [1, 16, 256, 4096]
 num_experts_range = [16, 64, 224, 256, 280, 512]
@ -87,8 +32,8 @@ configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range
        x_names=["num_tokens", "num_experts", "topk"],
        x_vals=configs,
        line_arg="provider",
-        line_vals=["vllm", "triton"],  # "triton"
+        line_vals=["vllm"],
-        line_names=["VLLM", "Triton"],  # "Triton"
+        line_names=["vLLM"],
        plot_name="moe-align-block-size-performance",
        args={},
    )
@ -98,36 +43,11 @@ def benchmark(num_tokens, num_experts, topk, provider):
    block_size = 256
    topk_ids = get_topk_ids(num_tokens, num_experts, topk)
    max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
    sorted_ids = torch.empty((max_num_tokens_padded,), dtype=torch.int32, device="cuda")
    max_num_m_blocks = max_num_tokens_padded // block_size
    expert_ids = torch.empty((max_num_m_blocks,), dtype=torch.int32, device="cuda")
    num_tokens_post_pad = torch.empty((1,), dtype=torch.int32, device="cuda")
    quantiles = [0.5, 0.2, 0.8]
    if provider == "vllm":
        ms, min_ms, max_ms = triton.testing.do_bench(
-            lambda: ops.moe_align_block_size(
+            lambda: moe_align_block_size(topk_ids, block_size, num_experts),
                topk_ids,
                num_experts,
                block_size,
                sorted_ids.clone(),
                expert_ids.clone(),
                num_tokens_post_pad.clone(),
            ),
            quantiles=quantiles,
        )
    elif provider == "triton":
        ms, min_ms, max_ms = triton.testing.do_bench(
            lambda: moe_align_block_size_triton(
                topk_ids,
                num_experts,
                block_size,
                sorted_ids.clone(),
                expert_ids.clone(),
                num_tokens_post_pad.clone(),
            ),
            quantiles=quantiles,
        )
@ -151,6 +71,4 @@ if __name__ == "__main__":
    )
    args = parser.parse_args()
    print("Running correctness check...")
    check_correctness(num_tokens=1024, num_experts=args.num_experts, topk=args.topk)
    benchmark.run(print_data=True, show_plots=True)
--- a/benchmarks/kernels/benchmark_moe_permute_unpermute.py
+++ b/benchmarks/kernels/benchmark_moe_permute_unpermute.py
@ -8,12 +8,13 @@ import ray
 import torch
 from transformers import AutoConfig
-from vllm.model_executor.layers.fused_moe.deep_gemm_moe import (
+from vllm.model_executor.layers.fused_moe.fused_moe import *
 from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import (
    _moe_permute,
    _moe_unpermute_and_reduce,
    moe_permute,
    moe_unpermute,
 )
 from vllm.model_executor.layers.fused_moe.fused_moe import *
 from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import *
 from vllm.model_executor.layers.fused_moe.utils import _fp8_quantize
 from vllm.platforms import current_platform
 from vllm.utils import FlexibleArgumentParser
@ -63,18 +64,19 @@ def benchmark_permute(
    def run():
        if use_customized_permute:
-            (permuted_hidden_states, first_token_off, inv_perm_idx, m_indices) = (
+            (
-                moe_permute(
+                permuted_hidden_states,
-                    qhidden_states,
+                a1q_scale,
-                    topk_weights=topk_weights,
+                first_token_off,
-                    topk_ids=topk_ids,
+                inv_perm_idx,
-                    token_expert_indices=token_expert_indices,
+                m_indices,
-                    topk=topk,
+            ) = moe_permute(
-                    n_expert=num_experts,
+                qhidden_states,
-                    n_local_expert=num_experts,
+                a1q_scale=None,
-                    expert_map=None,
+                topk_ids=topk_ids,
-                    align_block_size=align_block_size,
+                n_expert=num_experts,
-                )
+                expert_map=None,
                align_block_size=align_block_size,
            )
        else:
            (
@ -150,18 +152,19 @@ def benchmark_unpermute(
    def prepare():
        if use_customized_permute:
-            (permuted_hidden_states, first_token_off, inv_perm_idx, m_indices) = (
+            (
-                moe_permute(
+                permuted_hidden_states,
-                    qhidden_states,
+                a1q_scale,
-                    topk_weights=topk_weights,
+                first_token_off,
-                    topk_ids=topk_ids,
+                inv_perm_idx,
-                    token_expert_indices=token_expert_indices,
+                m_indices,
-                    topk=topk,
+            ) = moe_permute(
-                    n_expert=num_experts,
+                qhidden_states,
-                    n_local_expert=num_experts,
+                a1q_scale=None,
-                    expert_map=None,
+                topk_ids=topk_ids,
-                    align_block_size=align_block_size,
+                n_expert=num_experts,
-                )
+                expert_map=None,
                align_block_size=align_block_size,
            )
            # convert to fp16/bf16 as gemm output
            return (
@ -191,16 +194,19 @@ def benchmark_unpermute(
    def run(input: tuple):
        if use_customized_permute:
-            (permuted_hidden_states, first_token_off, inv_perm_idx, m_indices) = input
+            (
                permuted_hidden_states,
                first_token_off,
                inv_perm_idx,
                m_indices,
            ) = input
            output = torch.empty_like(hidden_states)
            moe_unpermute(
                output,
                permuted_hidden_states,
                topk_weights,
                topk_ids,
                inv_perm_idx,
                first_token_off,
                topk,
                num_experts,
                num_experts,
            )
        else:
            (
@ -211,7 +217,11 @@ def benchmark_unpermute(
                inv_perm,
            ) = input
            _moe_unpermute_and_reduce(
-                output_hidden_states, permuted_hidden_states, inv_perm, topk_weights
+                output_hidden_states,
                permuted_hidden_states,
                inv_perm,
                topk_weights,
                True,
            )
    # JIT compilation & warmup
--- a/csrc/cpu/torch_bindings.cpp
+++ b/csrc/cpu/torch_bindings.cpp
@ -151,7 +151,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  ops.impl("rotary_embedding", torch::kCPU, &rotary_embedding);
  // Quantization
-#if defined(__AVX512F__) || defined(__aarch64__)
+#if defined(__AVX512F__) || (defined(__aarch64__) && !defined(__APPLE__))
  at::Tag stride_tag = at::Tag::needs_fixed_stride_order;
  // Compute int8 quantized tensor for given scaling factor.
--- a/csrc/moe/moe_permute_unpermute_op.cu
+++ b/csrc/moe/moe_permute_unpermute_op.cu
@ -10,32 +10,28 @@
 void moe_permute(
    const torch::Tensor& input,                      // [n_token, hidden]
-    const torch::Tensor& topk_weights,               //[n_token, topk]
+    const torch::Tensor& topk_ids,                   // [n_token, topk]
    torch::Tensor& topk_ids,                         // [n_token, topk]
    const torch::Tensor& token_expert_indices,       // [n_token, topk]
    const std::optional<torch::Tensor>& expert_map,  // [n_expert]
    int64_t n_expert, int64_t n_local_expert, int64_t topk,
    const std::optional<int64_t>& align_block_size,
-    torch::Tensor&
+    torch::Tensor& permuted_input,             // [permuted_size, hidden]
        permuted_input,  // [topk * n_token/align_block_size_m, hidden]
    torch::Tensor& expert_first_token_offset,  // [n_local_expert + 1]
-    torch::Tensor& src_row_id2dst_row_id_map,  // [n_token, topk]
+    torch::Tensor& inv_permuted_idx,           // [n_token, topk]
    torch::Tensor& permuted_idx,               // [permute_size]
    torch::Tensor& m_indices) {                // [align_expand_m]
  TORCH_CHECK(topk_weights.scalar_type() == at::ScalarType::Float,
              "topk_weights must be float32");
  TORCH_CHECK(expert_first_token_offset.scalar_type() == at::ScalarType::Long,
              "expert_first_token_offset must be int64");
  TORCH_CHECK(topk_ids.scalar_type() == at::ScalarType::Int,
              "topk_ids must be int32");
  TORCH_CHECK(token_expert_indices.scalar_type() == at::ScalarType::Int,
              "token_expert_indices must be int32");
-  TORCH_CHECK(src_row_id2dst_row_id_map.scalar_type() == at::ScalarType::Int,
+  TORCH_CHECK(inv_permuted_idx.scalar_type() == at::ScalarType::Int,
-              "src_row_id2dst_row_id_map must be int32");
+              "inv_permuted_idx must be int32");
  TORCH_CHECK(expert_first_token_offset.size(0) == n_local_expert + 1,
              "expert_first_token_offset shape != n_local_expert+1")
-  TORCH_CHECK(
+  TORCH_CHECK(inv_permuted_idx.sizes() == token_expert_indices.sizes(),
-      src_row_id2dst_row_id_map.sizes() == token_expert_indices.sizes(),
+              "token_expert_indices shape must be same as inv_permuted_idx");
      "token_expert_indices shape must be same as src_row_id2dst_row_id_map");
  auto n_token = input.sizes()[0];
  auto n_hidden = input.sizes()[1];
  auto align_block_size_value =
@ -46,8 +42,9 @@ void moe_permute(
  auto sort_workspace = torch::empty(
      {sorter_size},
      torch::dtype(torch::kInt8).device(torch::kCUDA).requires_grad(false));
  auto copy_topk_ids = topk_ids.clone();  // copy topk_ids for preprocess
  auto permuted_experts_id = torch::empty_like(topk_ids);
-  auto dst_row_id2src_row_id_map = torch::empty_like(src_row_id2dst_row_id_map);
+  auto sorted_row_idx = torch::empty_like(inv_permuted_idx);
  auto align_expert_first_token_offset =
      torch::zeros_like(expert_first_token_offset);
@ -67,24 +64,22 @@ void moe_permute(
    const int* expert_map_ptr = get_ptr<int>(expert_map.value());
    valid_num_ptr =
        get_ptr<int64_t>(expert_first_token_offset) + n_local_expert;
-    preprocessTopkIdLauncher(get_ptr<int>(topk_ids), n_token * topk,
+    preprocessTopkIdLauncher(get_ptr<int>(copy_topk_ids), n_token * topk,
                             expert_map_ptr, n_expert, stream);
  }
  // sort the topk expert ids, then scan them to get expert_first_token_offset
-  sortAndScanExpert(get_ptr<int>(topk_ids), get_ptr<int>(token_expert_indices),
+  sortAndScanExpert(
-                    get_ptr<int>(permuted_experts_id),
+      get_ptr<int>(copy_topk_ids), get_ptr<int>(token_expert_indices),
-                    get_ptr<int>(dst_row_id2src_row_id_map),
+      get_ptr<int>(permuted_experts_id), get_ptr<int>(sorted_row_idx),
-                    get_ptr<int64_t>(expert_first_token_offset), n_token,
+      get_ptr<int64_t>(expert_first_token_offset), n_token, n_expert,
-                    n_expert, n_local_expert, topk, sorter,
+      n_local_expert, topk, sorter, get_ptr<int>(sort_workspace), stream);
                    get_ptr<int>(sort_workspace), stream);
  // dispatch expandInputRowsKernelLauncher
  MOE_DISPATCH(input.scalar_type(), [&] {
    expandInputRowsKernelLauncher<scalar_t>(
        get_ptr<scalar_t>(input), get_ptr<scalar_t>(permuted_input),
-        get_ptr<float>(topk_weights), get_ptr<int>(permuted_experts_id),
+        get_ptr<int>(permuted_experts_id), get_ptr<int>(sorted_row_idx),
-        get_ptr<int>(dst_row_id2src_row_id_map),
+        get_ptr<int>(inv_permuted_idx), get_ptr<int>(permuted_idx),
        get_ptr<int>(src_row_id2dst_row_id_map),
        get_ptr<int64_t>(expert_first_token_offset), n_token, valid_num_ptr,
        n_hidden, topk, n_local_expert, align_block_size_value, stream);
  });
@ -101,32 +96,34 @@ void moe_permute(
 }
 void moe_unpermute(
-    const torch::Tensor& permuted_hidden_states,     // [n_token * topk, hidden]
+    const torch::Tensor& permuted_hidden_states,  // [n_token * topk, hidden]
-    const torch::Tensor& topk_weights,               //[n_token, topk]
+    const torch::Tensor& topk_weights,            // [n_token, topk]
-    const torch::Tensor& topk_ids,                   // [n_token, topk]
+    const torch::Tensor& inv_permuted_idx,        // [n_token, topk]
-    const torch::Tensor& src_row_id2dst_row_id_map,  // [n_token, topk]
+    const std::optional<torch::Tensor>&
-    const torch::Tensor& expert_first_token_offset,  // [n_local_expert+1]
+        expert_first_token_offset,  // [n_local_expert+1]
-    int64_t n_expert, int64_t n_local_expert, int64_t topk,
+    int64_t topk,
    torch::Tensor& hidden_states  // [n_token, hidden]
 ) {
  TORCH_CHECK(src_row_id2dst_row_id_map.sizes() == topk_ids.sizes(),
              "topk_ids shape must be same as src_row_id2dst_row_id_map");
  TORCH_CHECK(topk_ids.scalar_type() == at::ScalarType::Int,
              "topk_ids must be int32");
  TORCH_CHECK(
      permuted_hidden_states.scalar_type() == hidden_states.scalar_type(),
-      "topk_ids dtype must be same as src_row_id2dst_row_id_map");
+      "permuted_hidden_states dtype must be same as hidden_states");
  auto n_token = hidden_states.size(0);
  auto n_hidden = hidden_states.size(1);
  auto stream = at::cuda::getCurrentCUDAStream().stream();
-  const int64_t* valid_ptr =
+
-      get_ptr<int64_t>(expert_first_token_offset) + n_local_expert;
+  int64_t const* valid_ptr = nullptr;
  if (expert_first_token_offset.has_value()) {
    int n_local_expert = expert_first_token_offset.value().size(0) - 1;
    valid_ptr =
        get_ptr<int64_t>(expert_first_token_offset.value()) + n_local_expert;
  }
  MOE_DISPATCH(hidden_states.scalar_type(), [&] {
    finalizeMoeRoutingKernelLauncher<scalar_t, scalar_t>(
        get_ptr<scalar_t>(permuted_hidden_states),
        get_ptr<scalar_t>(hidden_states), get_ptr<float>(topk_weights),
-        get_ptr<int>(src_row_id2dst_row_id_map), get_ptr<int>(topk_ids),
+        get_ptr<int>(inv_permuted_idx), n_token, n_hidden, topk, valid_ptr,
-        n_token, n_hidden, topk, valid_ptr, stream);
+        stream);
  });
 }
--- a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu
+++ b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu
@ -177,7 +177,7 @@ __global__ void getMIndicesKernel(int64_t* expert_first_token_offset,
  int tidx = threadIdx.x;
  extern __shared__ int64_t smem_expert_first_token_offset[];
  for (int i = tidx; i <= num_local_expert; i += blockDim.x) {
-    smem_expert_first_token_offset[tidx] = __ldg(expert_first_token_offset + i);
+    smem_expert_first_token_offset[i] = __ldg(expert_first_token_offset + i);
  }
  __syncthreads();
  auto last_token_offset = smem_expert_first_token_offset[eidx + 1];
--- a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.h
+++ b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.h
@ -57,31 +57,19 @@ void sortAndScanExpert(int* expert_for_source_row, const int* source_rows,
 template <typename T>
 void expandInputRowsKernelLauncher(
-    T const* unpermuted_input, T* permuted_output,
+    T const* unpermuted_input, T* permuted_output, int* sorted_experts,
    const float* unpermuted_scales, int* sorted_experts,
    int const* expanded_dest_row_to_expanded_source_row,
-    int* expanded_source_row_to_expanded_dest_row,
+    int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
    int64_t* expert_first_token_offset, int64_t const num_rows,
    int64_t const* num_valid_tokens_ptr, int64_t const cols, int const k,
    int num_local_experts, const int& align_block_size, cudaStream_t stream);
 // Final kernel to unpermute and scale
 // This kernel unpermutes the original data, does the k-way reduction and
 // performs the final skip connection.
 template <typename T, typename OutputType, bool CHECK_SKIPPED>
 __global__ void finalizeMoeRoutingKernel(
    T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output,
    float const* scales, int const* expanded_source_row_to_expanded_dest_row,
    int const* expert_for_source_row, int64_t const orig_cols, int64_t const k,
    int64_t const* num_valid_ptr);
 template <class T, class OutputType>
 void finalizeMoeRoutingKernelLauncher(
    T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output,
    float const* scales, int const* expanded_source_row_to_expanded_dest_row,
-    int const* expert_for_source_row, int64_t const num_rows,
+    int64_t const num_rows, int64_t const cols, int64_t const k,
-    int64_t const cols, int64_t const k, int64_t const* num_valid_ptr,
+    int64_t const* num_valid_ptr, cudaStream_t stream);
    cudaStream_t stream);
 void preprocessTopkIdLauncher(int* topk_id_ptr, int size,
                              const int* expert_map_ptr, int num_experts,
--- a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.inl
+++ b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.inl
@ -2,10 +2,9 @@
 template <typename T, bool CHECK_SKIPPED, bool ALIGN_BLOCK_SIZE>
 __global__ void expandInputRowsKernel(
-    T const* unpermuted_input, T* permuted_output,
+    T const* unpermuted_input, T* permuted_output, int* sorted_experts,
    const float* unpermuted_scales, int* sorted_experts,
    int const* expanded_dest_row_to_expanded_source_row,
-    int* expanded_source_row_to_expanded_dest_row,
+    int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
    int64_t* expert_first_token_offset, int64_t const num_rows,
    int64_t const* num_dest_rows, int64_t const cols, int64_t k,
    int num_local_experts, int align_block_size) {
@ -54,6 +53,10 @@ __global__ void expandInputRowsKernel(
    assert(expanded_dest_row <= INT32_MAX);
    expanded_source_row_to_expanded_dest_row[expanded_source_row] =
        static_cast<int>(expanded_dest_row);
    // skip non local expert token
    if (!CHECK_SKIPPED || blockIdx.x < *num_dest_rows) {
      permuted_idx[expanded_dest_row] = expanded_source_row;
    }
  }
  if (!CHECK_SKIPPED || blockIdx.x < *num_dest_rows) {
@ -62,7 +65,7 @@ __global__ void expandInputRowsKernel(
    using DataElem = cutlass::Array<T, ELEM_PER_THREAD>;
    // Duplicate and permute rows
-    int64_t const source_row = expanded_source_row % num_rows;
+    int64_t const source_row = expanded_source_row / k;
    auto const* source_row_ptr =
        reinterpret_cast<DataElem const*>(unpermuted_input + source_row * cols);
@ -82,10 +85,9 @@ __global__ void expandInputRowsKernel(
 template <typename T>
 void expandInputRowsKernelLauncher(
-    T const* unpermuted_input, T* permuted_output,
+    T const* unpermuted_input, T* permuted_output, int* sorted_experts,
    const float* unpermuted_scales, int* sorted_experts,
    int const* expanded_dest_row_to_expanded_source_row,
-    int* expanded_source_row_to_expanded_dest_row,
+    int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
    int64_t* expert_first_token_offset, int64_t const num_rows,
    int64_t const* num_valid_tokens_ptr, int64_t const cols, int const k,
    int num_local_experts, const int& align_block_size, cudaStream_t stream) {
@ -105,11 +107,11 @@ void expandInputRowsKernelLauncher(
  int64_t smem_size = sizeof(int64_t) * (num_local_experts + 1);
  func<<<blocks, threads, smem_size, stream>>>(
-      unpermuted_input, permuted_output, unpermuted_scales, sorted_experts,
+      unpermuted_input, permuted_output, sorted_experts,
      expanded_dest_row_to_expanded_source_row,
-      expanded_source_row_to_expanded_dest_row, expert_first_token_offset,
+      expanded_source_row_to_expanded_dest_row, permuted_idx,
-      num_rows, num_valid_tokens_ptr, cols, k, num_local_experts,
+      expert_first_token_offset, num_rows, num_valid_tokens_ptr, cols, k,
-      align_block_size);
+      num_local_experts, align_block_size);
 }
 template <class T, class U>
@ -128,11 +130,9 @@ template <typename T, typename OutputType, bool CHECK_SKIPPED>
 __global__ void finalizeMoeRoutingKernel(
    T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output,
    float const* scales, int const* expanded_source_row_to_expanded_dest_row,
-    int const* expert_for_source_row, int64_t const orig_cols, int64_t const k,
+    int64_t const orig_cols, int64_t const k, int64_t const* num_valid_ptr) {
    int64_t const* num_valid_ptr) {
  assert(orig_cols % 4 == 0);
  int64_t const original_row = blockIdx.x;
  int64_t const num_rows = gridDim.x;
  auto const offset = original_row * orig_cols;
  OutputType* reduced_row_ptr = reduced_unpermuted_output + offset;
  int64_t const num_valid = *num_valid_ptr;
@ -159,14 +159,13 @@ __global__ void finalizeMoeRoutingKernel(
    ComputeElem thread_output;
    thread_output.fill(0);
    for (int k_idx = 0; k_idx < k; ++k_idx) {
-      int64_t const expanded_original_row = original_row + k_idx * num_rows;
+      int64_t const expanded_original_row = original_row * k + k_idx;
      int64_t const expanded_permuted_row =
          expanded_source_row_to_expanded_dest_row[expanded_original_row];
      int64_t const k_offset = original_row * k + k_idx;
      float const row_scale = scales[k_offset];
      // Check after row_rescale has accumulated
      if (CHECK_SKIPPED && expanded_permuted_row >= num_valid) {
        continue;
      }
@ -189,9 +188,8 @@ template <class T, class OutputType>
 void finalizeMoeRoutingKernelLauncher(
    T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output,
    float const* scales, int const* expanded_source_row_to_expanded_dest_row,
-    int const* expert_for_source_row, int64_t const num_rows,
+    int64_t const num_rows, int64_t const cols, int64_t const k,
-    int64_t const cols, int64_t const k, int64_t const* num_valid_ptr,
+    int64_t const* num_valid_ptr, cudaStream_t stream) {
    cudaStream_t stream) {
  int64_t const blocks = num_rows;
  int64_t const threads = 256;
  bool const check_finished = num_valid_ptr != nullptr;
@ -201,6 +199,5 @@ void finalizeMoeRoutingKernelLauncher(
  auto* const kernel = func_map[check_finished];
  kernel<<<blocks, threads, 0, stream>>>(
      expanded_permuted_rows, reduced_unpermuted_output, scales,
-      expanded_source_row_to_expanded_dest_row, expert_for_source_row, cols, k,
+      expanded_source_row_to_expanded_dest_row, cols, k, num_valid_ptr);
      num_valid_ptr);
 }
--- a/csrc/moe/torch_bindings.cpp
+++ b/csrc/moe/torch_bindings.cpp
@ -56,18 +56,17 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
      " -> Tensor");
  m.def(
-      "moe_permute(Tensor input, Tensor topk_weight, Tensor! topk_ids,"
+      "moe_permute(Tensor input, Tensor topk_ids,"
      "Tensor token_expert_indices, Tensor? expert_map, int n_expert,"
      "int n_local_expert,"
      "int topk, int? align_block_size,Tensor! permuted_input, Tensor! "
-      "expert_first_token_offset, Tensor! src_row_id2dst_row_id_map, Tensor! "
+      "expert_first_token_offset, Tensor! inv_permuted_idx, Tensor! "
-      "m_indices)->()");
+      "permuted_idx, Tensor! m_indices)->()");
  m.def(
      "moe_unpermute(Tensor permuted_hidden_states, Tensor topk_weights,"
-      "Tensor topk_ids,Tensor src_row_id2dst_row_id_map, Tensor "
+      "Tensor inv_permuted_idx, Tensor? expert_first_token_offset, "
-      "expert_first_token_offset, int n_expert, int n_local_expert,int "
+      "int topk, Tensor! hidden_states)->()");
      "topk, Tensor! hidden_states)->()");
  m.def("moe_permute_unpermute_supported() -> bool");
  m.impl("moe_permute_unpermute_supported", &moe_permute_unpermute_supported);
--- a/csrc/ops.h
+++ b/csrc/ops.h
@ -292,6 +292,11 @@ void per_token_group_quant_fp8(const torch::Tensor& input,
                               torch::Tensor& output_q, torch::Tensor& output_s,
                               int64_t group_size, double eps, double fp8_min,
                               double fp8_max, bool scale_ue8m0);
 void per_token_group_quant_int8(const torch::Tensor& input,
                                torch::Tensor& output_q,
                                torch::Tensor& output_s, int64_t group_size,
                                double eps, double int8_min, double int8_max);
 #endif
 void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
--- a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu
+++ b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu
@ -1,6 +1,8 @@
 #include <ATen/cuda/CUDAContext.h>
 #include <torch/all.h>
 #include "../per_token_group_quant_8bit.h"
 #include <cmath>
 #include "../../dispatch_utils.h"
@ -336,3 +338,11 @@ void dynamic_scaled_int8_quant(
        }
      });
 }
 // INT8 entry point for per-token-group quantization.
 //
 // Thin wrapper that forwards all of its arguments, in order, to the shared
 // per_token_group_quant_8bit helper; int8_min / int8_max are passed in the
 // helper's min_8bit / max_8bit positions.
 //
 // output_q and output_s are taken by non-const reference and are registered
 // as mutable (`Tensor!`) arguments in the op schema in torch_bindings.cpp.
 // NOTE(review): presumably output_q receives the quantized values and
 // output_s the per-group scales -- confirm against the helper's kernel.
 void per_token_group_quant_int8(const torch::Tensor& input,
                                torch::Tensor& output_q,
                                torch::Tensor& output_s, int64_t group_size,
                                double eps, double int8_min, double int8_max) {
  // scale_ue8m0 is deliberately not passed, so the default declared in
  // per_token_group_quant_8bit.h (false) applies on the INT8 path.
  per_token_group_quant_8bit(input, output_q, output_s, group_size, eps,
                             int8_min, int8_max);
 }
--- a/csrc/quantization/fp8/per_token_group_quant.cu
+++ b/csrc/quantization/fp8/per_token_group_quant.cu
@ -1,6 +1,8 @@
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/util/Float8_e4m3fn.h>
 #include "../per_token_group_quant_8bit.h"
 #include <cmath>
 #include <cuda_fp16.h>
@ -120,7 +122,7 @@ void per_token_group_quant_8bit(const torch::Tensor& input,
                                torch::Tensor& output_q,
                                torch::Tensor& output_s, int64_t group_size,
                                double eps, double min_8bit, double max_8bit,
-                                bool scale_ue8m0 = false) {
+                                bool scale_ue8m0) {
  TORCH_CHECK(input.is_contiguous());
  TORCH_CHECK(output_q.is_contiguous());
@ -198,6 +200,8 @@ void per_token_group_quant_8bit(const torch::Tensor& input,
      input.scalar_type(), "per_token_group_quant_8bit", ([&] {
        if (dst_type == at::ScalarType::Float8_e4m3fn) {
          LAUNCH_KERNEL(scalar_t, c10::Float8_e4m3fn);
        } else if (dst_type == at::ScalarType::Char) {
          LAUNCH_KERNEL(scalar_t, int8_t);
        }
      }));
--- a/csrc/quantization/machete/machete_prepacked_layout.cuh
+++ b/csrc/quantization/machete/machete_prepacked_layout.cuh
@ -187,8 +187,12 @@ struct PrepackedLayoutBTemplate {
  CUTE_HOST_DEVICE static constexpr auto TVbNbKL_to_offset_copy(
      Shape_NKL shape_mkl) {
    auto layout = TVbNbKL_to_offset(shape_mkl);
-    return make_layout(coalesce(get<0>(layout)), get<1>(layout),
+    // for 4-bit elements, having >= 64 values per column
-                       get<2>(layout));
+    // allows TMA to load full 32-byte sectors
    auto inner_layout =
        make_layout(make_shape(_256{}, size<0>(layout) / _256{}));
    return make_layout(inner_layout, get<1>(layout), get<2>(layout));
  }
  // ((BlockN, BlockK), (BlocksN, BlocksK), L) -> (storage_idx)
--- a/csrc/quantization/per_token_group_quant_8bit.h
+++ b/csrc/quantization/per_token_group_quant_8bit.h
@ -0,0 +1,10 @@
 #pragma once
 #include <torch/all.h>
 // TODO(wentao): refactor the folder to 8bit, then includes fp8 and int8 folders
 // 8-bit per-token-group quantization helper used by both FP8 and INT8
 void per_token_group_quant_8bit(const torch::Tensor& input,
                                torch::Tensor& output_q,
                                torch::Tensor& output_s, int64_t group_size,
                                double eps, double min_8bit, double max_8bit,
                                bool scale_ue8m0 = false);
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@ -624,6 +624,14 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  ops.impl("per_token_group_fp8_quant", torch::kCUDA,
           &per_token_group_quant_fp8);
  // Compute per-token-group INT8 quantized tensor and scaling factor.
  ops.def(
      "per_token_group_quant_int8(Tensor input, Tensor! output_q, Tensor! "
      "output_s, int group_size, float eps, float int8_min, float int8_max) -> "
      "()");
  ops.impl("per_token_group_quant_int8", torch::kCUDA,
           &per_token_group_quant_int8);
  // reorder weight for AllSpark Ampere W8A16 Fused Gemm kernel
  ops.def(
      "rearrange_kn_weight_as_n32k16_order(Tensor b_qweight, Tensor b_scales, "
--- a/docker/Dockerfile.arm
+++ b/docker/Dockerfile.arm
@ -1,62 +0,0 @@
 # This vLLM Dockerfile is used to construct an image that can build and run vLLM on ARM CPU platform.
 FROM ubuntu:22.04 AS cpu-test-arm
 ENV CCACHE_DIR=/root/.cache/ccache
 ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
 RUN --mount=type=cache,target=/var/cache/apt \
    apt-get update -y \
    && apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
    && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
    && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
 # tcmalloc provides better memory allocation efficiency, e.g., holding memory in caches to speed up access of commonly-used objects.
 RUN --mount=type=cache,target=/root/.cache/pip \
    pip install py-cpuinfo  # Use this to gather CPU info and optimize based on ARM Neoverse cores
 # Set LD_PRELOAD for tcmalloc on ARM
 ENV LD_PRELOAD="/usr/lib/aarch64-linux-gnu/libtcmalloc_minimal.so.4"
 RUN echo 'ulimit -c 0' >> ~/.bashrc
 WORKDIR /workspace
 ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
 ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
 RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=bind,src=requirements/build.txt,target=requirements/build.txt \
    pip install --upgrade pip && \
    pip install -r requirements/build.txt
 FROM cpu-test-arm AS build
 WORKDIR /workspace/vllm
 RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=bind,src=requirements/common.txt,target=requirements/common.txt \
    --mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \
    pip install -v -r requirements/cpu.txt
 COPY . .
 ARG GIT_REPO_CHECK=0
 RUN --mount=type=bind,source=.git,target=.git \
    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
 # Disabling AVX512 specific optimizations for ARM
 ARG VLLM_CPU_DISABLE_AVX512="true"
 ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
 RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=cache,target=/root/.cache/ccache \
    --mount=type=bind,source=.git,target=.git \
    VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
    pip install dist/*.whl && \
    rm -rf dist
 WORKDIR /workspace/
 RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
 ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
--- a/docker/Dockerfile.cpu
+++ b/docker/Dockerfile.cpu
@ -1,4 +1,11 @@
-# This vLLM Dockerfile is used to construct image that can build and run vLLM on x86 CPU platform.
+# This vLLM Dockerfile is used to build images that can run vLLM on both x86_64 and arm64 CPU platforms.
 #
 # Supported platforms:
 #   - linux/amd64 (x86_64)
 #   - linux/arm64 (aarch64)
 #
 # Use the `--platform` option with `docker buildx build` to specify the target architecture, e.g.:
 #   docker buildx build --platform=linux/arm64 -f docker/Dockerfile.cpu .
 #
 # Build targets:
 #   vllm-openai (default): used for serving deployment
@ -53,7 +60,20 @@ RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --upgrade pip && \
    uv pip install -r requirements/cpu.txt
-ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/opt/venv/lib/libiomp5.so:$LD_PRELOAD"
+ARG TARGETARCH
 ENV TARGETARCH=${TARGETARCH}
 RUN if [ "$TARGETARCH" = "arm64" ]; then \
        PRELOAD_PATH="/usr/lib/aarch64-linux-gnu/libtcmalloc_minimal.so.4"; \
    else \
        PRELOAD_PATH="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/opt/venv/lib/libiomp5.so"; \
    fi && \
    echo "export LD_PRELOAD=$PRELOAD_PATH" >> ~/.bashrc
 # Ensure that the LD_PRELOAD environment variable for export is in effect.
 SHELL ["/bin/bash", "-c"]
 ENV LD_PRELOAD=${LD_PRELOAD}
 RUN echo 'ulimit -c 0' >> ~/.bashrc
--- a/docker/Dockerfile.tpu
+++ b/docker/Dockerfile.tpu
@ -1,4 +1,4 @@
-ARG NIGHTLY_DATE="20250714"
+ARG NIGHTLY_DATE="20250724"
 ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.12_tpuvm_$NIGHTLY_DATE"
 FROM $BASE_IMAGE
--- a/docs/contributing/profiling.md
+++ b/docs/contributing/profiling.md
@ -9,10 +9,13 @@ We support tracing vLLM workers using the `torch.profiler` module. You can enabl
 The OpenAI server also needs to be started with the `VLLM_TORCH_PROFILER_DIR` environment variable set.
-When using `benchmarks/benchmark_serving.py`, you can enable profiling by passing the `--profile` flag.
+When using `vllm bench serve`, you can enable profiling by passing the `--profile` flag.
 Traces can be visualized using <https://ui.perfetto.dev/>.
 !!! tip
    You can directly call the bench module without installing vLLM using `python -m vllm.entrypoints.cli.main bench`.
 !!! tip
    Only send a few requests through vLLM when profiling, as the traces can get quite large. Also, no need to untar the traces, they can be viewed directly.
@ -35,10 +38,10 @@ VLLM_TORCH_PROFILER_DIR=./vllm_profile \
    --model meta-llama/Meta-Llama-3-70B
 ```
-benchmark_serving.py:
+vllm bench command:
 ```bash
-python benchmarks/benchmark_serving.py \
+vllm bench serve \
    --backend vllm \
    --model meta-llama/Meta-Llama-3-70B \
    --dataset-name sharegpt \
@ -69,13 +72,13 @@ apt install nsight-systems-cli
 For basic usage, you can just append `nsys profile -o report.nsys-rep --trace-fork-before-exec=true --cuda-graph-trace=node` before any existing script you would run for offline inference.
-The following is an example using the `benchmarks/benchmark_latency.py` script:
+The following is an example using the `vllm bench latency` command:
 ```bash
 nsys profile -o report.nsys-rep \
    --trace-fork-before-exec=true \
    --cuda-graph-trace=node \
-    python benchmarks/benchmark_latency.py \
+vllm bench latency \
    --model meta-llama/Llama-3.1-8B-Instruct \
    --num-iters-warmup 5 \
    --num-iters 1 \
@ -98,7 +101,7 @@ nsys profile -o report.nsys-rep \
    vllm serve meta-llama/Llama-3.1-8B-Instruct
 # client
-python benchmarks/benchmark_serving.py \
+vllm bench serve \
    --backend vllm \
    --model meta-llama/Llama-3.1-8B-Instruct \
    --num-prompts 1 \
@ -132,7 +135,7 @@ You can view these profiles either as summaries in the CLI, using `nsys stats [p
    ...
    ** CUDA GPU Kernel Summary (cuda_gpu_kern_sum):
-    Time (%)  Total Time (ns)  Instances   Avg (ns)     Med (ns)    Min (ns)  Max (ns)   StdDev (ns)                                                  Name                                                
+    Time (%)  Total Time (ns)  Instances   Avg (ns)     Med (ns)    Min (ns)  Max (ns)   StdDev (ns)                                                  Name
    --------  ---------------  ---------  -----------  -----------  --------  ---------  -----------  ----------------------------------------------------------------------------------------------------
        46.3   10,327,352,338     17,505    589,965.9    144,383.0    27,040  3,126,460    944,263.8  sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize128x128x64_warpgroupsize1x1x1_execute_segment_k_of…
        14.8    3,305,114,764      5,152    641,520.7    293,408.0   287,296  2,822,716    867,124.9  sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize256x128x64_warpgroupsize2x1x1_execute_segment_k_of…
@ -143,7 +146,7 @@ You can view these profiles either as summaries in the CLI, using `nsys stats [p
        2.6      587,283,113     37,824     15,526.7      3,008.0     2,719  2,517,756    139,091.1  std::enable_if<T2>(int)0&&vllm::_typeConvert<T1>::exists, void>::type vllm::fused_add_rms_norm_kern…
        1.9      418,362,605     18,912     22,121.5      3,871.0     3,328  2,523,870    175,248.2  void vllm::rotary_embedding_kernel<c10::BFloat16, (bool)1>(const long *, T1 *, T1 *, const T1 *, in…
        0.7      167,083,069     18,880      8,849.7      2,240.0     1,471  2,499,996    101,436.1  void vllm::reshape_and_cache_flash_kernel<__nv_bfloat16, __nv_bfloat16, (vllm::Fp8KVCacheDataType)0…
-    ... 
+    ...
    ```
 GUI example:
--- a/docs/design/v1/p2p_nccl_connector.md
+++ b/docs/design/v1/p2p_nccl_connector.md
@ -3,14 +3,14 @@ An implementation of xPyD with dynamic scaling based on point-to-point communica
 # Detailed Design
 ## Overall Process
-As shown in Figure 1, the overall process of this **PD disaggregation** solution is described through a request flow:  
+As shown in Figure 1, the overall process of this **PD disaggregation** solution is described through a request flow:
-1. The client sends an HTTP request to the Proxy/Router's `/v1/completions` interface.  
+1. The client sends an HTTP request to the Proxy/Router's `/v1/completions` interface.
-2. The Proxy/Router selects a **1P1D (1 Prefill instance + 1 Decode instance)** through either through round-robin or random selection, generates a `request_id` (rules to be introduced later), modifies the `max_tokens` in the HTTP request message to **1**, and then forwards the request to the **P instance**.  
+2. The Proxy/Router selects a **1P1D (1 Prefill instance + 1 Decode instance)** through either round-robin or random selection, generates a `request_id` (rules to be introduced later), modifies the `max_tokens` in the HTTP request message to **1**, and then forwards the request to the **P instance**.
-3. Immediately afterward, the Proxy/Router forwards the **original HTTP request** to the **D instance**.  
+3. Immediately afterward, the Proxy/Router forwards the **original HTTP request** to the **D instance**.
-4. The **P instance** performs **Prefill** and then **actively sends the generated KV cache** to the D instance (using **PUT_ASYNC** mode). The D instance's `zmq_addr` can be resolved through the `request_id`.  
+4. The **P instance** performs **Prefill** and then **actively sends the generated KV cache** to the D instance (using **PUT_ASYNC** mode). The D instance's `zmq_addr` can be resolved through the `request_id`.
-5. The **D instance** has a **dedicated thread** for receiving the KV cache (to avoid blocking the main process). The received KV cache is saved into the **GPU memory buffer**, the size of which is determined by the vLLM startup parameter `kv_buffer_size`. When the GPU buffer is full, the KV cache is stored in the **local Tensor memory pool**.  
+5. The **D instance** has a **dedicated thread** for receiving the KV cache (to avoid blocking the main process). The received KV cache is saved into the **GPU memory buffer**, the size of which is determined by the vLLM startup parameter `kv_buffer_size`. When the GPU buffer is full, the KV cache is stored in the **local Tensor memory pool**.
-6. During the **Decode**, the D instance's main process retrieves the KV cache (transmitted by the P instance) from either the **GPU buffer** or the **memory pool**, thereby **skipping Prefill**.  
+6. During the **Decode**, the D instance's main process retrieves the KV cache (transmitted by the P instance) from either the **GPU buffer** or the **memory pool**, thereby **skipping Prefill**.
 7. After completing **Decode**, the D instance returns the result to the **Proxy/Router**, which then forwards it to the **client**.
 ![image1](https://github.com/user-attachments/assets/fb01bde6-755b-49f7-ad45-48a94b1e10a7)
@ -291,7 +291,7 @@ curl -X POST -s http://10.0.1.1:10001/v1/completions \
 ??? console "Command"
    ```shell
-    python3 benchmark_serving.py \
+    vllm bench serve \
        --backend vllm \
        --model base_model \
        --tokenizer meta-llama/Llama-3.1-8B-Instruct \
--- a/docs/features/multimodal_inputs.md
+++ b/docs/features/multimodal_inputs.md
@ -177,6 +177,70 @@ Multi-image input can be extended to perform video captioning. We show this with
 You can pass a list of NumPy arrays directly to the `'video'` field of the multi-modal dictionary
 instead of using multi-image input.
 Instead of NumPy arrays, you can also pass `torch.Tensor` instances, as shown in this example using Qwen2.5-VL:
 ??? code
    ```python
    from transformers import AutoProcessor
    from vllm import LLM, SamplingParams
    from qwen_vl_utils import process_vision_info
    model_path = "Qwen/Qwen2.5-VL-3B-Instruct/"
    video_path = "https://content.pexels.com/videos/free-videos.mp4"
    llm = LLM(
        model=model_path,
        gpu_memory_utilization=0.8,
        enforce_eager=True,
        limit_mm_per_prompt={"video": 1},
    )
    sampling_params = SamplingParams(
        max_tokens=1024,
    )
    video_messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": [
                {"type": "text", "text": "describe this video."},
                {
                    "type": "video",
                    "video": video_path,
                    "total_pixels": 20480 * 28 * 28,
                    "min_pixels": 16 * 28 * 28
                }
            ]
        },
    ]
    messages = video_messages
    processor = AutoProcessor.from_pretrained(model_path)
    prompt = processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    image_inputs, video_inputs = process_vision_info(messages)
    mm_data = {}
    if video_inputs is not None:
        mm_data["video"] = video_inputs
    llm_inputs = {
        "prompt": prompt,
        "multi_modal_data": mm_data,
    }
    outputs = llm.generate([llm_inputs], sampling_params=sampling_params)
    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)
    ```
    !!! note
        `process_vision_info` is only applicable to Qwen2.5-VL and similar models.
 Full example: <gh-file:examples/offline_inference/vision_language.py>
 ### Audio Inputs
--- a/docs/features/quantization/README.md
+++ b/docs/features/quantization/README.md
@ -6,6 +6,7 @@ Contents:
 - [Supported Hardware](supported_hardware.md)
 - [AutoAWQ](auto_awq.md)
 - [AutoRound](auto_round.md)
 - [BitsAndBytes](bnb.md)
 - [BitBLAS](bitblas.md)
 - [GGUF](gguf.md)
--- a/docs/features/quantization/auto_round.md
+++ b/docs/features/quantization/auto_round.md
@ -0,0 +1,103 @@
 # AutoRound
 [AutoRound](https://github.com/intel/auto-round) is Intel’s advanced quantization algorithm designed to produce highly efficient **INT2, INT3, INT4, and INT8**
 quantized large language models—striking an optimal balance between accuracy and deployment performance.
 AutoRound applies weight-only quantization to transformer-based models, enabling significant memory savings and faster
 inference while maintaining near-original accuracy. It supports a wide range of hardware platforms, including **CPUs,
 Intel GPUs, HPUs, and CUDA-enabled devices**.
 Please refer to the [AutoRound guide](https://github.com/intel/auto-round/blob/main/docs/step_by_step.md) for more details.
 Key Features:
 ✅ **AutoRound, AutoAWQ, AutoGPTQ, and GGUF** are supported
 ✅ **10+ vision-language models (VLMs)** are supported
 ✅ **Per-layer mixed-bit quantization** for fine-grained control
 ✅ **RTN (Round-To-Nearest) mode** for quick quantization with slight accuracy loss
 ✅ **Multiple quantization recipes**: best, base, and light
 ✅ Advanced utilities such as immediate packing and support for **10+ backends**
 ## Installation
 ```bash
 uv pip install auto-round
 ```
 ## Quantizing a model
 For VLMs, please change to `auto-round-mllm` in CLI usage and `AutoRoundMLLM` in API usage.
 ### CLI usage
 ```bash
 auto-round \
    --model Qwen/Qwen3-0.6B \
    --bits 4 \
    --group_size 128 \
    --format "auto_round" \
    --output_dir ./tmp_autoround
 ```
 ```bash
 auto-round \
    --model Qwen/Qwen3-0.6B \
    --format "gguf:q4_k_m" \
    --output_dir ./tmp_autoround
 ```
 ### API usage
 ```python
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from auto_round import AutoRound
 model_name = "Qwen/Qwen3-0.6B"
 model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 bits, group_size, sym = 4, 128, True
 autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym)
 # the best accuracy, 4-5X slower, low_gpu_mem_usage could save ~20G but ~30% slower
 # autoround = AutoRound(model, tokenizer, nsamples=512, iters=1000, low_gpu_mem_usage=True, bits=bits, group_size=group_size, sym=sym)
 # 2-3X speedup, slight accuracy drop at W4G128
 # autoround = AutoRound(model, tokenizer, nsamples=128, iters=50, lr=5e-3, bits=bits, group_size=group_size, sym=sym )
 output_dir = "./tmp_autoround"
 # format= 'auto_round'(default), 'auto_gptq', 'auto_awq'
 autoround.quantize_and_save(output_dir, format="auto_round")
 ```
 ## Running a quantized model with vLLM
 Here is some example code to run an AutoRound-format model in vLLM:
 ```python
 from vllm import LLM, SamplingParams
 prompts = [
    "Hello, my name is",
 ]
 sampling_params = SamplingParams(temperature=0.6, top_p=0.95)
 model_name = "Intel/DeepSeek-R1-0528-Qwen3-8B-int4-AutoRound"
 llm = LLM(model=model_name)
 outputs = llm.generate(prompts, sampling_params)
 for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
 ```
 ## Acknowledgement
 Special thanks to open-source low precision libraries such as AutoGPTQ, AutoAWQ, GPTQModel, Triton, Marlin, and
 ExLLaMAV2 for providing low-precision CUDA kernels, which are leveraged in AutoRound.
--- a/docs/getting_started/installation/cpu/arm.inc.md
+++ b/docs/getting_started/installation/cpu/arm.inc.md
@ -33,7 +33,7 @@ Testing has been conducted on AWS Graviton3 instances for compatibility.
 # --8<-- [end:pre-built-images]
 # --8<-- [start:build-image-from-source]
 ```bash
-docker build -f docker/Dockerfile.arm \
+docker build -f docker/Dockerfile.cpu \
        --tag vllm-cpu-env .
 # Launching OpenAI server
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@ -365,6 +365,7 @@ th {
 | `Grok1ModelForCausalLM` | Grok1 | `hpcai-tech/grok-1`. | ✅︎ | ✅︎ | ✅︎ |
 | `HunYuanDenseV1ForCausalLM` | Hunyuan-7B-Instruct-0124 | `tencent/Hunyuan-7B-Instruct-0124` | ✅︎ | | ✅︎ |
 | `HunYuanMoEV1ForCausalLM` | Hunyuan-80B-A13B | `tencent/Hunyuan-A13B-Instruct`, `tencent/Hunyuan-A13B-Pretrain`, `tencent/Hunyuan-A13B-Instruct-FP8`, etc. | ✅︎ | | ✅︎ |
 | `HCXVisionForCausalLM` | HyperCLOVAX-SEED-Vision-Instruct-3B | `naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B` | | | ✅︎ |
 | `InternLMForCausalLM` | InternLM | `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `InternLM2ForCausalLM` | InternLM2 | `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `InternLM3ForCausalLM` | InternLM3 | `internlm/internlm3-8b-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
@ -592,6 +593,7 @@ Specified using `--task generate`.
 | `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | ✅︎ |
 | `H2OVLChatModel` | H2OVL | T + I<sup>E+</sup> | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎ |
 | `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | | ✅︎ |
 | `InternS1ForConditionalGeneration` | Intern-S1 | T + I<sup>E+</sup> + V<sup>E+</sup> | `internlm/Intern-S1`, etc. | | ✅︎ | ✅︎ |
 | `InternVLChatModel` | InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I<sup>E+</sup> + (V<sup>E+</sup>) | `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-8B-Preview` | | | ✅︎ |
 | `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I<sup>+</sup> | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | | ✅︎ |
@ -612,6 +614,7 @@ Specified using `--task generate`.
 | `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + I<sup>E</sup> | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. | | ✅︎ | ⚠️ |
 | `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + I<sup>E+</sup> | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | | ✅︎ | ✅︎ |
 | `Phi4MMForCausalLM` | Phi-4-multimodal | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `Phi4MultimodalForCausalLM` | Phi-4-multimodal (HF Transformers) | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct` (with revision `refs/pr/70`), etc. | ✅︎ | ✅︎ | ✅︎ |
 | `PixtralForConditionalGeneration` | Mistral 3 (Mistral format), Pixtral (Mistral format) | T + I<sup>+</sup> | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistralai/Pixtral-12B-2409`, etc. | | ✅︎ | ✅︎ |
 | `QwenVLForConditionalGeneration`<sup>^</sup> | Qwen-VL | T + I<sup>E+</sup> | `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `Qwen2AudioForConditionalGeneration` | Qwen2-Audio | T + A<sup>+</sup> | `Qwen/Qwen2-Audio-7B-Instruct` | | ✅︎ | ✅︎ |
--- a/docs/training/rlhf.md
+++ b/docs/training/rlhf.md
@ -2,10 +2,14 @@
 Reinforcement Learning from Human Feedback (RLHF) is a technique that fine-tunes language models using human-generated preference data to align model outputs with desired behaviors.
-vLLM can be used to generate the completions for RLHF. The best way to do this is with libraries like [TRL](https://github.com/huggingface/trl), [OpenRLHF](https://github.com/OpenRLHF/OpenRLHF) and [verl](https://github.com/volcengine/verl).
+vLLM can be used to generate the completions for RLHF. Some ways to do this include using libraries like [TRL](https://github.com/huggingface/trl), [OpenRLHF](https://github.com/OpenRLHF/OpenRLHF), [verl](https://github.com/volcengine/verl) and [unsloth](https://github.com/unslothai/unsloth).
 See the following basic examples to get started if you don't want to use an existing library:
 - [Training and inference processes are located on separate GPUs (inspired by OpenRLHF)](../examples/offline_inference/rlhf.md)
 - [Training and inference processes are colocated on the same GPUs using Ray](../examples/offline_inference/rlhf_colocate.md)
 - [Utilities for performing RLHF with vLLM](../examples/offline_inference/rlhf_utils.md)
 See the following notebooks showing how to use vLLM for GRPO:
 - [Qwen-3 4B GRPO using Unsloth + vLLM](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_(4B)-GRPO.ipynb)
--- a/examples/offline_inference/audio_language.py
+++ b/examples/offline_inference/audio_language.py
@ -190,6 +190,37 @@ def run_phi4mm(question: str, audio_count: int) -> ModelRequestData:
    )
 def run_phi4_multimodal(question: str, audio_count: int) -> ModelRequestData:
    """
    Prepare an audio request for Phi-4-multimodal-instruct.
    The model accepts image and audio inputs alike; only the audio path is
    demonstrated here.
    """
    checkpoint_dir = snapshot_download(
        "microsoft/Phi-4-multimodal-instruct", revision="refs/pr/70"
    )
    # The vision-lora and speech-lora adapters co-exist with the base model,
    # so the speech adapter directory has to be named explicitly.
    speech_adapter_dir = os.path.join(checkpoint_dir, "speech-lora")
    # One "<|audio|>" marker per clip, placed directly before the question.
    audio_markers = "<|audio|>" * audio_count
    chat_prompt = f"<|user|>{audio_markers}{question}<|end|><|assistant|>"
    return ModelRequestData(
        engine_args=EngineArgs(
            model=checkpoint_dir,
            max_model_len=12800,
            max_num_seqs=2,
            enable_lora=True,
            max_lora_rank=320,
            limit_mm_per_prompt={"audio": audio_count},
        ),
        prompt=chat_prompt,
        lora_requests=[LoRARequest("speech", 1, speech_adapter_dir)],
    )
 # Qwen2-Audio
 def run_qwen2_audio(question: str, audio_count: int) -> ModelRequestData:
    model_name = "Qwen/Qwen2-Audio-7B-Instruct"
@ -303,6 +334,7 @@ model_example_map = {
    "granite_speech": run_granite_speech,
    "minicpmo": run_minicpmo,
    "phi4_mm": run_phi4mm,
    "phi4_multimodal": run_phi4_multimodal,
    "qwen2_audio": run_qwen2_audio,
    "qwen2_5_omni": run_qwen2_5_omni,
    "ultravox": run_ultravox,
--- a/examples/offline_inference/prithvi_geospatial_mae.py
+++ b/examples/offline_inference/prithvi_geospatial_mae.py
@ -3,12 +3,12 @@
 import argparse
 import datetime
 import os
 import re
 from typing import Union
 import albumentations
 import numpy as np
 import rasterio
 import regex as re
 import torch
 from einops import rearrange
 from terratorch.datamodules import Sen1Floods11NonGeoDataModule
--- a/examples/offline_inference/save_sharded_state.py
+++ b/examples/offline_inference/save_sharded_state.py
@ -29,6 +29,7 @@ import shutil
 from pathlib import Path
 from vllm import LLM, EngineArgs
 from vllm.model_executor.model_loader import ShardedStateLoader
 from vllm.utils import FlexibleArgumentParser
@ -39,7 +40,10 @@ def parse_args():
        "--output", "-o", required=True, type=str, help="path to output checkpoint"
    )
    parser.add_argument(
-        "--file-pattern", type=str, help="string pattern of saved filenames"
+        "--file-pattern",
        type=str,
        default=ShardedStateLoader.DEFAULT_PATTERN,
        help="string pattern of saved filenames",
    )
    parser.add_argument(
        "--max-file-size",
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@ -316,6 +316,85 @@ def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
    )
 # naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B
 def run_hyperclovax_seed_vision(
    questions: list[str], modality: str
 ) -> ModelRequestData:
    """Build engine args and chat-templated prompts for HyperCLOVAX-SEED-Vision.

    One single-turn user message is created per question. Each message holds
    a media entry for ``modality`` followed by the question text, and the
    messages are rendered with the model tokenizer's chat template.

    Args:
        questions: Question texts; one conversation is built per entry.
        modality: ``"image"`` or ``"video"``.

    Returns:
        A ``ModelRequestData`` with the engine arguments, the rendered
        prompts, and ``stop_token_ids=None``.

    Raises:
        ValueError: If ``modality`` is neither ``"image"`` nor ``"video"``.
    """
    # Validate the modality first so that an unsupported value is rejected
    # before the tokenizer is loaded, and even when `questions` is empty.
    if modality == "image":
        # Optional hint fields of an image entry (left empty in this example):
        #   ocr: List the words in the image in raster order.
        #       Even if the word order feels unnatural for reading,
        #       the model will handle it as long as it follows raster order.
        #       e.g. "Naver, CLOVA, bigshane"
        #   lens_keywords: List the entity names in the image.
        #       e.g. "iPhone"
        #   lens_local_keywords: List the entity names with quads in the image.
        #       e.g. "[0.07, 0.21, 0.92, 0.90] iPhone"
        media_entry = {
            "type": "image",
            "ocr": "",
            "lens_keywords": "",
            "lens_local_keywords": "",
        }
        max_model_len = 8192
    elif modality == "video":
        media_entry = {"type": "video"}
        max_model_len = 16384
    else:
        raise ValueError(f"Unsupported modality: {modality}")
    model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=max_model_len,
        limit_mm_per_prompt={modality: 1},
    )
    # Each question gets its own conversation with a fresh copy of the
    # media entry, so no dict object is shared between messages.
    messages = [
        [
            {
                "role": "user",
                "content": [
                    dict(media_entry),
                    {
                        "type": "text",
                        "text": question,
                    },
                ],
            }
        ]
        for question in questions
    ]
    prompts = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=None,
    )
 # Idefics3-8B-Llama3
 def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"
@ -389,6 +468,39 @@ def run_tarsier(questions: list[str], modality: str) -> ModelRequestData:
    )
 # Intern-S1
 def run_interns1(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine args and chat-templated prompts for Intern-S1.

    Each question becomes a single-turn user message whose text is the
    modality placeholder, a newline, and the question. The messages are
    rendered with the model tokenizer's chat template.

    Args:
        questions: Question texts; one prompt is produced per entry.
        modality: ``"image"`` or ``"video"``.

    Returns:
        A ``ModelRequestData`` with the engine arguments and rendered prompts.

    Raises:
        ValueError: If ``modality`` is neither ``"image"`` nor ``"video"``.
    """
    # Resolve the placeholder first: without the explicit else branch an
    # unknown modality would leave `placeholder` unbound and only fail later
    # with an UnboundLocalError.
    if modality == "image":
        placeholder = "<IMG_CONTEXT>"
    elif modality == "video":
        placeholder = "<video>"
    else:
        raise ValueError(f"Unsupported modality: {modality}")
    model_name = "internlm/Intern-S1"
    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
        max_num_seqs=2,
        limit_mm_per_prompt={modality: 1},
        enforce_eager=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    messages = [
        [{"role": "user", "content": f"{placeholder}\n{question}"}]
        for question in questions
    ]
    prompts = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
 # InternVL
 def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
    model_name = "OpenGVLab/InternVL3-2B"
@ -987,6 +1099,41 @@ def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData:
    )
 # HF format Phi-4-multimodal-instruct
 def run_phi4_multimodal(questions: list[str], modality: str) -> ModelRequestData:
    """
    Build an image request for the HF-format Phi-4-multimodal-instruct.

    The model accepts both image and audio inputs; this example only covers
    image inputs, served with the vision LoRA found inside the downloaded
    model snapshot.
    """
    assert modality == "image"
    snapshot_dir = snapshot_download(
        "microsoft/Phi-4-multimodal-instruct", revision="refs/pr/70"
    )
    # vision-lora and speech-lora live alongside the base model weights, so
    # the adapter directory must be spelled out by hand.
    adapter_dir = os.path.join(snapshot_dir, "vision-lora")
    image_prompts = [
        f"<|user|><|image|>{question}<|end|><|assistant|>" for question in questions
    ]
    engine_config = EngineArgs(
        model=snapshot_dir,
        max_model_len=5120,
        max_num_seqs=2,
        max_num_batched_tokens=12800,
        enable_lora=True,
        max_lora_rank=320,
        # mm_processor_kwargs may instead be supplied to generate/chat calls
        mm_processor_kwargs={"dynamic_hd": 16},
        limit_mm_per_prompt={"image": 1},
    )
    vision_adapter = LoRARequest("vision", 1, adapter_dir)
    return ModelRequestData(
        engine_args=engine_config,
        prompts=image_prompts,
        lora_requests=[vision_adapter],
    )
 # Pixtral HF-format
 def run_pixtral_hf(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"
@ -1222,7 +1369,9 @@ model_example_map = {
    "glm4v": run_glm4v,
    "glm4_1v": run_glm4_1v,
    "h2ovl_chat": run_h2ovl,
    "hyperclovax_seed_vision": run_hyperclovax_seed_vision,
    "idefics3": run_idefics3,
    "interns1": run_interns1,
    "internvl_chat": run_internvl,
    "nemotron_vl": run_nemotron_vl,
    "keye_vl": run_keye_vl,
@ -1244,6 +1393,7 @@ model_example_map = {
    "paligemma2": run_paligemma2,
    "phi3_v": run_phi3v,
    "phi4_mm": run_phi4mm,
    "phi4_multimodal": run_phi4_multimodal,
    "pixtral_hf": run_pixtral_hf,
    "qwen_vl": run_qwen_vl,
    "qwen2_vl": run_qwen2_vl,
--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@ -253,6 +253,33 @@ def load_smolvlm(question: str, image_urls: list[str]) -> ModelRequestData:
    )
 def load_interns1(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for Intern-S1.

    The user message lists one ``Image-<n>: <IMG_CONTEXT>`` line per URL
    (numbered from 1) ahead of the question, and is rendered with the model
    tokenizer's chat template. Images are fetched from ``image_urls``.
    """
    model_name = "internlm/Intern-S1"
    num_images = len(image_urls)
    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=4096,
        limit_mm_per_prompt={"image": num_images},
    )
    image_lines = [
        f"Image-{index}: <IMG_CONTEXT>\n" for index in range(1, num_images + 1)
    ]
    placeholders = "\n".join(image_lines)
    conversation = [{"role": "user", "content": f"{placeholders}\n{question}"}]
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    rendered = tokenizer.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )
    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=rendered,
        image_data=images,
    )
 def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
    model_name = "OpenGVLab/InternVL2-2B"
@ -289,6 +316,53 @@ def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
    )
 def load_hyperclovax_seed_vision(
    question: str, image_urls: list[str]
 ) -> ModelRequestData:
    """Build a multi-image request for HyperCLOVAX-SEED-Vision.

    The single user message holds one image entry per URL (with empty
    ``ocr`` / ``lens_keywords`` / ``lens_local_keywords`` fields) followed
    by the question text, and is rendered with the tokenizer's chat
    template. Images are fetched from ``image_urls``.
    """
    model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=16384,
        limit_mm_per_prompt={"image": len(image_urls)},
    )
    image_entries = [
        {
            "type": "image",
            "image": url,
            "ocr": "",
            "lens_keywords": "",
            "lens_local_keywords": "",
        }
        for url in image_urls
    ]
    text_entry = {
        "type": "text",
        "text": question,
    }
    user_turn = {"role": "user", "content": [*image_entries, text_entry]}
    rendered = tokenizer.apply_chat_template(
        [user_turn],
        tokenize=False,
        add_generation_prompt=True,
    )
    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=rendered,
        stop_token_ids=None,
        image_data=images,
    )
 def load_llava(question: str, image_urls: list[str]) -> ModelRequestData:
    # NOTE: CAUTION! The original Llava models weren't really trained on multi-image
    # inputs, so they will generate poor responses for multi-image inputs!
@ -686,6 +760,40 @@ def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData:
    )
 def load_phi4_multimodal(question: str, image_urls: list[str]) -> ModelRequestData:
    """
    Build a multi-image request for the HF-format Phi-4-multimodal-instruct.

    The model accepts both image and audio inputs; this example covers
    several images in one prompt, served with the vision LoRA found inside
    the downloaded model snapshot.
    """
    snapshot_dir = snapshot_download(
        "microsoft/Phi-4-multimodal-instruct", revision="refs/pr/70"
    )
    # vision-lora and speech-lora live alongside the base model weights, so
    # the adapter directory must be spelled out by hand.
    adapter_dir = os.path.join(snapshot_dir, "vision-lora")
    num_images = len(image_urls)
    engine_config = EngineArgs(
        model=snapshot_dir,
        max_model_len=4096,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": num_images},
        enable_lora=True,
        max_lora_rank=320,
        # mm_processor_kwargs may instead be supplied to generate/chat calls
        mm_processor_kwargs={"dynamic_hd": 4},
    )
    # One image token per URL, placed directly in front of the question.
    placeholders = "<|image|>" * num_images
    prompt = f"<|user|>{placeholders}{question}<|end|><|assistant|>"
    images = [fetch_image(url) for url in image_urls]
    vision_adapter = LoRARequest("vision", 1, adapter_dir)
    return ModelRequestData(
        engine_args=engine_config,
        prompt=prompt,
        image_data=images,
        lora_requests=[vision_adapter],
    )
 def load_qwen_vl_chat(question: str, image_urls: list[str]) -> ModelRequestData:
    model_name = "Qwen/Qwen-VL-Chat"
    engine_args = EngineArgs(
@ -899,7 +1007,9 @@ model_example_map = {
    "gemma3": load_gemma3,
    "h2ovl_chat": load_h2ovl,
    "idefics3": load_idefics3,
    "interns1": load_interns1,
    "internvl_chat": load_internvl,
    "hyperclovax_seed_vision": load_hyperclovax_seed_vision,
    "keye_vl": load_keye_vl,
    "kimi_vl": load_kimi_vl,
    "llava": load_llava,
@ -912,6 +1022,7 @@ model_example_map = {
    "ovis": load_ovis,
    "phi3_v": load_phi3v,
    "phi4_mm": load_phi4mm,
    "phi4_multimodal": load_phi4_multimodal,
    "pixtral_hf": load_pixtral_hf,
    "qwen_vl_chat": load_qwen_vl_chat,
    "qwen2_vl": load_qwen2_vl,
--- a/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh
+++ b/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh
@ -29,7 +29,7 @@ PROXY_PORT=${PROXY_PORT:-30001}
 PREFILL_GPUS=${PREFILL_GPUS:-0}
 DECODE_GPUS=${DECODE_GPUS:-1,2,3}
 PREFILL_PORTS=${PREFILL_PORTS:-20003}
-DECODE_PORTS=${DECODE_PORTS:-20005,20007,20009} 
+DECODE_PORTS=${DECODE_PORTS:-20005,20007,20009}
 echo "Warning: P2P NCCL disaggregated prefill XpYd support for vLLM v1 is experimental and subject to change."
 echo ""
@ -164,7 +164,7 @@ main() {
        local gpu_id=${PREFILL_GPU_ARRAY[$i]}
        local port=${PREFILL_PORT_ARRAY[$i]}
        local kv_port=$((21001 + i))
-        
+
        echo "  Prefill server $((i+1)): GPU $gpu_id, Port $port, KV Port $kv_port"
        CUDA_VISIBLE_DEVICES=$gpu_id VLLM_USE_V1=1 vllm serve $MODEL \
        --enforce-eager \
@ -193,7 +193,7 @@ main() {
        local gpu_id=${DECODE_GPU_ARRAY[$i]}
        local port=${DECODE_PORT_ARRAY[$i]}
        local kv_port=$((22001 + i))
-        
+
        echo "  Decode server $((i+1)): GPU $gpu_id, Port $port, KV Port $kv_port"
        VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=$gpu_id vllm serve $MODEL \
        --enforce-eager \
@ -233,7 +233,7 @@ main() {
    # Run Benchmark
    # =============================================================================
    cd ../../../benchmarks/
-    python3 benchmark_serving.py --port 10001 --seed $(date +%s) \
+    vllm bench serve --port 10001 --seed $(date +%s) \
        --model $MODEL \
        --dataset-name random --random-input-len 7500 --random-output-len 200 \
        --num-prompts 200 --burstiness 100 --request-rate 2 | tee benchmark.log
@ -243,4 +243,4 @@ main() {
    cleanup
 }
-main
+main
--- a/examples/online_serving/prometheus_grafana/README.md
+++ b/examples/online_serving/prometheus_grafana/README.md
@ -28,7 +28,7 @@ Submit some sample requests to the server:
 ```bash
 wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-python3 ../../../benchmarks/benchmark_serving.py \
+vllm bench serve \
    --model mistralai/Mistral-7B-v0.1 \
    --tokenizer mistralai/Mistral-7B-v0.1 \
    --endpoint /v1/completions \
--- a/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh
+++ b/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh
@ -122,7 +122,7 @@ main() {
    # begin benchmark
    cd ../../../../benchmarks/
-    python3 benchmark_serving.py --port 9000 --seed $(date +%s) \
+    vllm bench serve --port 9000 --seed $(date +%s) \
        --model meta-llama/Llama-3.1-8B-Instruct \
        --dataset-name random --random-input-len 7500 --random-output-len 200 \
        --num-prompts 200 --burstiness 100 --request-rate 3.6 | tee benchmark.log
@ -133,4 +133,4 @@ main() {
 }
-main
+main
--- a/requirements/cpu.txt
+++ b/requirements/cpu.txt
@ -10,7 +10,8 @@ setuptools>=77.0.3,<80.0.0
 --extra-index-url https://download.pytorch.org/whl/cpu
 torch==2.6.0+cpu; platform_machine == "x86_64" # torch>2.6.0+cpu has performance regression on x86 platform, see https://github.com/pytorch/pytorch/pull/151218
 torch==2.7.0; platform_system == "Darwin"
-torch==2.7.0; platform_machine == "ppc64le" or platform_machine == "aarch64"
+torch==2.7.0; platform_machine == "ppc64le"
 torch==2.6.0; platform_machine == "aarch64" # for arm64 CPUs, torch 2.7.0 has an issue: https://github.com/vllm-project/vllm/issues/17960
 # required for the image processor of minicpm-o-2_6, this must be updated alongside torch
 torchaudio; platform_machine != "ppc64le" and platform_machine != "s390x"
@ -25,3 +26,6 @@ datasets # for benchmark scripts
 intel-openmp==2024.2.1; platform_machine == "x86_64"
 intel_extension_for_pytorch==2.6.0; platform_machine == "x86_64" # torch>2.6.0+cpu has performance regression on x86 platform, see https://github.com/pytorch/pytorch/pull/151218
 triton==3.2.0; platform_machine == "x86_64" # Triton is required for torch 2.6+cpu, as it is imported in torch.compile.
 # Use this to gather CPU info and optimize based on ARM Neoverse cores
 py-cpuinfo; platform_machine == "aarch64"
--- a/requirements/tpu.txt
+++ b/requirements/tpu.txt
@ -19,8 +19,8 @@ nixl==0.3.0
 --find-links https://storage.googleapis.com/libtpu-releases/index.html
 --find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html
 --find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
-torch==2.9.0.dev20250716
+torch==2.9.0.dev20250724
-torchvision==0.24.0.dev20250716
+torchvision==0.24.0.dev20250724
-torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.9.0.dev20250716-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
+torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.9.0.dev20250724-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
-torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.9.0.dev20250716-cp312-cp312-linux_x86_64.whl ; python_version == "3.12"
+torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.9.0.dev20250724-cp312-cp312-linux_x86_64.whl ; python_version == "3.12"
--- a/tests/conftest.py
+++ b/tests/conftest.py
@ -1062,8 +1062,17 @@ class VllmRunner:
        return [req_output.outputs.score for req_output in req_outputs]
    def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]:
-        executor = self.llm.llm_engine.model_executor
+        if hasattr(self.llm.llm_engine, "model_executor"):
-        return executor.apply_model(func)
+            # This works either in V0 or in V1 with
            # VLLM_ENABLE_V1_MULTIPROCESSING=0
            executor = self.llm.llm_engine.model_executor
            return executor.apply_model(func)
        # This works in V1 with VLLM_ALLOW_INSECURE_SERIALIZATION=1
        def _apply_model(self):
            return func(self.get_model())
        return self.llm.llm_engine.collective_rpc(_apply_model)
    def __enter__(self):
        return self
--- a/tests/entrypoints/openai/test_skip_tokenizer.py
+++ b/tests/entrypoints/openai/test_skip_tokenizer.py
@ -0,0 +1,93 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import base64
 import io
 import numpy as np
 import pytest
 import requests
 import torch
 from ...utils import RemoteOpenAIServer
 MODEL_NAME = "christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM"
 DTYPE = "float16"
@pytest.fixture(autouse=True)
 def v1(run_with_both_engines):
    """Pull in ``run_with_both_engines`` so each test runs with both engines.

    This is only a thin wrapper; it could be promoted to conftest.py to
    apply to every test in a package.
    """
@pytest.fixture(scope="module")
 def server():
    """Start a ``RemoteOpenAIServer`` for ``MODEL_NAME`` and yield it.

    The server is launched for the ``embed`` task with tokenizer
    initialization skipped, and stays inside the ``with`` block for as long
    as the generator is suspended at the ``yield``.
    """
    task_args = ["--task", "embed"]
    # use half precision for speed and memory savings in CI environment
    dtype_args = ["--dtype", DTYPE]
    switch_args = [
        "--enforce-eager",
        "--trust-remote-code",
        "--skip-tokenizer-init",
    ]
    batching_args = ["--max-num-seqs", "32"]
    launch_args = [*task_args, *dtype_args, *switch_args, *batching_args]
    with RemoteOpenAIServer(MODEL_NAME, launch_args) as running_server:
        yield running_server
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
 async def test_single_request(server: RemoteOpenAIServer, model_name: str):
    """Send one pooling request with tensor inputs and check the output size.

    Two constant float16 tensors are serialized with ``torch.save``,
    base64-encoded, and posted to the server's ``pooling`` route as
    ``image_embeds`` content. The base64 payload of the first result is
    decoded as float32 and must contain 524288 values.
    """
    def _to_base64(tensor: torch.Tensor) -> str:
        # Serialize with torch.save into memory, then base64-encode as text.
        stream = io.BytesIO()
        torch.save(tensor, stream)
        return base64.b64encode(stream.getvalue()).decode('utf-8')
    pixel_values = torch.full((6, 512, 512), 1.0, dtype=torch.float16)
    location_coords = torch.full((1, 2), 1.0, dtype=torch.float16)
    embeds_entry = {
        "type": "image_embeds",
        "image_embeds": {
            "pixel_values": _to_base64(pixel_values),
            "location_coords": _to_base64(location_coords),
        },
    }
    payload = {
        "model": model_name,
        "additional_data": {
            "prompt_token_ids": [1]
        },
        "encoding_format": "base64",
        "messages": [{
            "role": "user",
            "content": [embeds_entry],
        }],
    }
    # test single pooling
    reply = requests.post(server.url_for("pooling"), json=payload)
    reply.raise_for_status()
    encoded_output = reply.json()["data"][0]['data']
    decoded = np.frombuffer(base64.b64decode(encoded_output), dtype=np.float32)
    assert len(decoded) == 524288
--- a/tests/kernels/attention/test_aiter_flash_attn.py
+++ b/tests/kernels/attention/test_aiter_flash_attn.py
@ -0,0 +1,191 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Optional
 import pytest
 import torch
 import vllm.v1.attention.backends.rocm_aiter_fa  # noqa: F401
 from vllm.platforms import current_platform
 NUM_HEADS = [(4, 4), (8, 2), (16, 2)]
 HEAD_SIZES = [128, 256]
 BLOCK_SIZES = [16, 32]
 DTYPES = [torch.float16, torch.bfloat16]
 QDTYPES = [None]
 # one value large enough to test overflow in index calculation.
 # one value small enough to test the schema op check
 NUM_BLOCKS = [32768, 2048]
 def ref_paged_attn(
    query: torch.Tensor,
    key_cache: torch.Tensor,
    value_cache: torch.Tensor,
    query_lens: list[int],
    kv_lens: list[int],
    block_tables: torch.Tensor,
    scale: float,
    sliding_window: Optional[int] = None,
    soft_cap: Optional[float] = None,
 ) -> torch.Tensor:
    """Reference paged attention, computed one sequence at a time in PyTorch.

    Args:
        query: Packed queries of all sequences, indexed as
            (token, head, head_dim).
        key_cache: Paged keys, indexed as
            (block, slot_in_block, kv_head, head_dim).
        value_cache: Paged values with the same layout as ``key_cache``.
        query_lens: Number of query tokens of each sequence.
        kv_lens: Number of key/value tokens of each sequence. Entries may
            be plain ints or 0-dim integer tensors.
        block_tables: Per-sequence block ids, indexed as (sequence, block).
        scale: Factor applied to the queries before the dot product.
        sliding_window: If given, a second triangular mask derived from this
            width is OR-ed into the causal mask.
        soft_cap: If given, scores become ``soft_cap * tanh(score / soft_cap)``
            before masking.

    Returns:
        The attention outputs of all sequences concatenated along dim 0.
        ``query`` itself is left unmodified.
    """
    num_seqs = len(query_lens)
    _, block_size, num_kv_heads, head_size = key_cache.shape
    outputs: list[torch.Tensor] = []
    start_idx = 0
    for i in range(num_seqs):
        # int() normalises 0-dim tensors so the sizes and diagonals below
        # are plain Python ints.
        query_len = int(query_lens[i])
        kv_len = int(kv_lens[i])
        # Out-of-place multiply: an in-place `q *= scale` on this slice would
        # write the scaled values back into the caller's `query` tensor.
        q = query[start_idx:start_idx + query_len] * scale
        # Gather the blocks of this sequence and flatten them into a token
        # axis; the last block may be partially filled, hence the trim.
        num_kv_blocks = (kv_len + block_size - 1) // block_size
        block_indices = block_tables[i, :num_kv_blocks].to(
            device=key_cache.device, dtype=torch.long)
        k = key_cache[block_indices].view(-1, num_kv_heads, head_size)
        k = k[:kv_len]
        v = value_cache[block_indices].view(-1, num_kv_heads, head_size)
        v = v[:kv_len]
        # Repeat KV heads so that every query head has a matching KV head.
        if q.shape[1] != k.shape[1]:
            k = torch.repeat_interleave(k, q.shape[1] // k.shape[1], dim=1)
            v = torch.repeat_interleave(v, q.shape[1] // v.shape[1], dim=1)
        attn = torch.einsum("qhd,khd->hqk", q, k).float()
        # Build masks on the same device as the scores instead of relying on
        # the process-wide default device.
        empty_mask = torch.ones(query_len, kv_len, device=attn.device)
        # True marks positions to hide: keys after the query's own position
        # when the queries are aligned to the end of the KV sequence.
        mask = torch.triu(empty_mask, diagonal=kv_len - query_len + 1).bool()
        if sliding_window is not None:
            sliding_window_mask = torch.triu(empty_mask,
                                             diagonal=kv_len -
                                             (query_len + sliding_window) +
                                             1).bool().logical_not()
            mask |= sliding_window_mask
        if soft_cap is not None:
            attn = soft_cap * torch.tanh(attn / soft_cap)
        attn.masked_fill_(mask, float("-inf"))
        attn = torch.softmax(attn, dim=-1).to(v.dtype)
        out = torch.einsum("hqk,khd->qhd", attn, v)
        outputs.append(out)
        start_idx += query_len
    return torch.cat(outputs, dim=0)
@pytest.mark.skipif(not current_platform.is_rocm(),
                    reason="Only ROCm is supported")
@pytest.mark.parametrize("seq_lens",
                         [[(10, 1328), (5, 18),
                           (129, 463)], [(8, 523), (24, 37), (3, 2011)]])
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
@pytest.mark.parametrize("sliding_window", [None, 256])
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("soft_cap", [None])
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
@pytest.mark.parametrize("q_dtype", QDTYPES)
@torch.inference_mode()
 def test_varlen_with_paged_kv(
    seq_lens: list[tuple[int, int]],
    num_heads: tuple[int, int],
    head_size: int,
    sliding_window: Optional[int],
    dtype: torch.dtype,
    block_size: int,
    soft_cap: Optional[float],
    num_blocks: int,
    q_dtype: Optional[torch.dtype],
 ) -> None:
    """Compare ``torch.ops.vllm.flash_attn_varlen_func`` with ``ref_paged_attn``.

    Random queries, a random paged KV cache and random block tables are
    generated on the ``cuda`` device with a fixed seed. The op writes into a
    preallocated output, which is then checked against the PyTorch reference
    within dtype-dependent tolerances.

    Args:
        seq_lens: One ``(query_len, kv_len)`` pair per sequence.
        num_heads: ``(num_query_heads, num_kv_heads)``.
        head_size: Size of each attention head.
        sliding_window: Window width, or ``None`` to pass ``(-1, -1)`` as
            ``window_size``.
        dtype: Dtype of the generated query and KV cache.
        block_size: Number of slots per KV-cache block.
        soft_cap: Forwarded to the reference implementation only.
        num_blocks: Number of blocks in the KV cache.
        q_dtype: If not ``None``, Q/K/V are cast to this dtype for the op and
            unit descale factors are supplied.
    """
    torch.set_default_device("cuda")
    current_platform.seed_everything(0)
    num_seqs = len(seq_lens)
    query_lens = [x[0] for x in seq_lens]
    # Kept as a plain list: it is only consumed by the reference, whose
    # signature takes list[int]; the op receives cumulative lengths instead.
    kv_lens = [x[1] for x in seq_lens]
    num_query_heads, num_kv_heads = num_heads
    assert num_query_heads % num_kv_heads == 0
    max_query_len = max(query_lens)
    max_kv_len = max(kv_lens)
    window_size = ((sliding_window - 1, 0) if sliding_window is not None else
                   (-1, -1))
    scale = head_size**-0.5
    # NOTE: the order of the random draws below is part of the seeded setup.
    query = torch.randn(sum(query_lens),
                        num_query_heads,
                        head_size,
                        dtype=dtype)
    key_cache = torch.randn(num_blocks,
                            block_size,
                            num_kv_heads,
                            head_size,
                            dtype=dtype)
    value_cache = torch.randn_like(key_cache)
    # Cumulative lengths with a leading 0.
    cu_query_lens = torch.tensor([0] + query_lens,
                                 dtype=torch.int32).cumsum(dim=0,
                                                           dtype=torch.int32)
    cu_seq_lens = torch.tensor([0] + kv_lens,
                               dtype=torch.int32).cumsum(dim=0,
                                                         dtype=torch.int32)
    max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size
    block_tables = torch.randint(0,
                                 num_blocks,
                                 (num_seqs, max_num_blocks_per_seq),
                                 dtype=torch.int32)
    output = torch.empty_like(query)
    maybe_quantized_query = query
    maybe_quantized_key_cache = key_cache
    maybe_quantized_value_cache = value_cache
    k_descale = None
    v_descale = None
    if q_dtype is not None:
        # QKV are drawn from N(0, 1): no need for a fp8 scaling factor
        maybe_quantized_query = query.to(q_dtype)
        maybe_quantized_key_cache = key_cache.to(q_dtype)
        maybe_quantized_value_cache = value_cache.to(q_dtype)
        scale_shape = (num_seqs, num_kv_heads)
        k_descale = torch.ones(scale_shape, dtype=torch.float32)
        v_descale = torch.ones(scale_shape, dtype=torch.float32)
    torch.ops.vllm.flash_attn_varlen_func(
        maybe_quantized_query,
        maybe_quantized_key_cache,
        maybe_quantized_value_cache,
        out=output,
        cu_seqlens_q=cu_query_lens,
        max_seqlen_q=max_query_len,
        max_seqlen_k=max_kv_len,
        softmax_scale=scale,
        alibi_slopes=None,
        window_size=window_size,
        block_table=block_tables,
        cu_seqlens_k=cu_seq_lens,
        k_scale=k_descale,
        v_scale=v_descale,
    )
    ref_output = ref_paged_attn(
        query=query,
        key_cache=key_cache,
        value_cache=value_cache,
        query_lens=query_lens,
        kv_lens=kv_lens,
        block_tables=block_tables,
        scale=scale,
        sliding_window=sliding_window,
        soft_cap=soft_cap,
    )
    # Looser tolerances when the inputs went through a quantized dtype.
    atol, rtol = (1.5e-1, 1.5e-1) if q_dtype is not None else (2e-2, 2e-2)
    # assert_close raises on mismatch and already reports the greatest
    # absolute/relative difference; the former trailing `, f"..."` only built
    # a discarded tuple and never served as a failure message.
    torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol)
--- a/tests/kernels/moe/test_moe_permute_unpermute.py
+++ b/tests/kernels/moe/test_moe_permute_unpermute.py
@ -17,28 +17,34 @@ from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import (
    moe_permute, moe_permute_unpermute_supported, moe_unpermute)
 from vllm.platforms import current_platform
-NUM_EXPERTS = [16, 64]
+NUM_EXPERTS = [16, 64, 256]
 TOP_KS = [2, 4, 6, 8]
 EP_SIZE = [1, 4, 16]
 current_platform.seed_everything(0)
-def torch_permute(hidden_states: torch.Tensor,
+def torch_permute(
-                  topk_ids: torch.Tensor,
+        hidden_states: torch.Tensor,
-                  token_expert_indices: torch.Tensor,
+        topk_ids: torch.Tensor,
-                  topk: int,
+        #   token_expert_indices: torch.Tensor,
-                  n_expert: int,
+        topk: int,
-                  n_local_expert: int,
+        n_expert: int,
-                  start_expert: int,
+        n_local_expert: int,
-                  expert_map: Optional[torch.Tensor] = None,
+        start_expert: int,
-                  align_block_size: Optional[int] = None,
+        expert_map: Optional[torch.Tensor] = None,
-                  fill_invalid_expert: int = -1) -> list[torch.Tensor]:
+        align_block_size: Optional[int] = None,
        fill_invalid_expert: int = -1) -> list[torch.Tensor]:
    n_token, n_hidden = hidden_states.shape[0], hidden_states.shape[1]
    if expert_map is not None:
        is_local_expert = (expert_map[topk_ids] != -1)
        not_local_expert = (expert_map[topk_ids] == -1)
        topk_ids = is_local_expert * (
            topk_ids - start_expert) + not_local_expert * (topk_ids + n_expert)
    token_expert_indices = torch.arange(0,
                                        n_token * topk,
                                        dtype=torch.int32,
                                        device=hidden_states.device).reshape(
                                            (n_token, topk))
    sorted_topk_ids, sorted_indices = torch.sort(topk_ids.flatten(),
                                                 stable=True)
@ -59,8 +65,8 @@ def torch_permute(hidden_states: torch.Tensor,
    valid_row_idx = []
    if align_block_size is None:
-        permuted_hidden_states = hidden_states[dst_row_id2src_row_id_map %
+        permuted_hidden_states = hidden_states[dst_row_id2src_row_id_map //
-                                               n_token, ...]
+                                               topk, ...]
        permuted_row_size = permuted_hidden_states.shape[0]
        m_indices = torch.empty(permuted_row_size,
                                device="cuda",
@ -73,14 +79,21 @@ def torch_permute(hidden_states: torch.Tensor,
            0, n_token * topk, device="cuda",
            dtype=torch.int32)[src2dst_idx].reshape((n_token, topk))
        valid_row_idx += [i for i in range(expert_first_token_offset[-1])]
        dst_row_id2src_row_id_map[
            expert_first_token_offset[-1]:] = n_token * topk
        return [
            permuted_hidden_states, expert_first_token_offset,
-            src_row_id2dst_row_id_map, m_indices, valid_row_idx
+            src_row_id2dst_row_id_map, dst_row_id2src_row_id_map, m_indices,
            valid_row_idx
        ]
    else:
        permuted_row_size = (topk * n_token + n_expert *
                             (align_block_size - 1) + align_block_size -
                             1) // align_block_size * align_block_size
        permuted_idx = torch.full((permuted_row_size, ),
                                  n_token * topk,
                                  dtype=torch.int32,
                                  device=hidden_states.device)
        permuted_hidden_states = torch.empty((permuted_row_size, n_hidden),
                                             device="cuda",
                                             dtype=hidden_states.dtype)
@ -105,13 +118,16 @@ def torch_permute(hidden_states: torch.Tensor,
            align_first_token_offset = align_expert_first_token_offset[i - 1]
            align_last_token_offset = align_expert_first_token_offset[i]
            dst_row_id2src_row_id_in_expert = dst_row_id2src_row_id_map[
-                first_token_offset:first_token_offset +
+                first_token_offset:first_token_offset + n_token_in_expert]
                n_token_in_expert] % n_token
            # store token in current expert with align_first_token_offset
            permuted_hidden_states[align_first_token_offset:\
                                   align_first_token_offset+n_token_in_expert,\
                                      ...] = hidden_states[\
-                                       dst_row_id2src_row_id_in_expert, ...]
+                                       dst_row_id2src_row_id_in_expert // topk,\
                                          ...]
            permuted_idx[align_first_token_offset:\
                         align_first_token_offset+\
                         n_token_in_expert] = dst_row_id2src_row_id_in_expert
            # set current expert m_indices
            m_indices[align_first_token_offset:align_last_token_offset] = i - 1
            valid_row_idx += [
@ -135,7 +151,7 @@ def torch_permute(hidden_states: torch.Tensor,
            src2dst_idx].reshape((n_token, topk))
        return [
            permuted_hidden_states, align_expert_first_token_offset,
-            align_src_row_id2dst_row_id, m_indices, valid_row_idx
+            align_src_row_id2dst_row_id, permuted_idx, m_indices, valid_row_idx
        ]
@ -146,15 +162,18 @@ def torch_unpermute(permuted_hidden_states: torch.Tensor,
                    valid_row_idx: torch.Tensor, topk: int,
                    n_expert: int) -> torch.Tensor:
    # ignore invalid row
    n_hidden = permuted_hidden_states.shape[1]
    mask = torch.zeros(permuted_hidden_states.shape[0],
                       dtype=bool,
                       device="cuda")
    mask[valid_row_idx] = True
    permuted_hidden_states[~mask] = 0
-    idx = src_row_id2dst_row_id_map.flatten()[
+
-        token_expert_indices.flatten()].reshape(token_expert_indices.shape)
+    permuted_hidden_states = permuted_hidden_states[
-    output = permuted_hidden_states[idx, ...] * topk_weights[..., None]
+        src_row_id2dst_row_id_map.flatten(), ...]
-    output = output.sum(dim=1).to(permuted_hidden_states.dtype)
+    permuted_hidden_states = permuted_hidden_states.view(-1, topk, n_hidden)
    output = (permuted_hidden_states * topk_weights.unsqueeze(2)).sum(1).to(
        permuted_hidden_states.dtype)
    return output
@ -184,43 +203,56 @@ def test_moe_permute_unpermute(n_token: int, n_hidden: int, topk: int,
    gating_output = torch.randn((n_token, n_expert), device="cuda").to(dtype)
    topk_weights, topk_ids, token_expert_indices = fused_topk(
        hidden_states, gating_output, topk, False)
-    gold0, gold1, gold2, gold3, valid_row_idx = torch_permute(
+    (gold_permuted_hidden_states, gold_expert_first_token_offset,
-        hidden_states,
+     gold_inv_permuted_idx, gold_permuted_idx, gold_m_indices,
-        topk_ids,
+     valid_row_idx) = torch_permute(
-        token_expert_indices,
+         hidden_states,
-        topk,
+         topk_ids,
-        n_expert,
+         # token_expert_indices,
-        n_local_expert,
+         topk,
-        start_expert,
+         n_expert,
-        expert_map=expert_map,
+         n_local_expert,
-        align_block_size=align_block_size,
+         start_expert,
-        fill_invalid_expert=fill_invalid_expert)
+         expert_map=expert_map,
         align_block_size=align_block_size,
         fill_invalid_expert=fill_invalid_expert)
-    result0, result1, result2, result3 = moe_permute(
+    (permuted_hidden_states, _, expert_first_token_offset, inv_permuted_idx,
-        hidden_states, topk_weights, topk_ids, token_expert_indices, topk,
+     m_indices) = moe_permute(hidden_states=hidden_states,
-        n_expert, n_local_expert, expert_map, align_block_size,
+                              a1q_scale=None,
-        fill_invalid_expert)
+                              topk_ids=topk_ids,
                              n_expert=n_expert,
                              n_local_expert=n_local_expert,
                              expert_map=expert_map,
                              align_block_size=align_block_size,
                              fill_invalid_expert=fill_invalid_expert)
    # check expert_first_token_offset
-    torch.testing.assert_close(gold1, result1, atol=0, rtol=0)
+    torch.testing.assert_close(gold_expert_first_token_offset,
-    # check src_row_id2dst_row_id_map
+                               expert_first_token_offset,
-    torch.testing.assert_close(gold2, result2, atol=0, rtol=0)
+                               atol=0,
-    # check mindice
+                               rtol=0)
-    torch.testing.assert_close(gold3, result3, atol=0, rtol=0)
+    # check src_row_id2dst_row_id_map
-    # check permuted_hidden_states, only valid token
+    torch.testing.assert_close(gold_inv_permuted_idx.flatten(),
-    torch.testing.assert_close(gold0[valid_row_idx],
+                               inv_permuted_idx,
-                               result0[valid_row_idx],
+                               atol=0,
                               rtol=0)
    # check m_indices
    torch.testing.assert_close(gold_m_indices, m_indices, atol=0, rtol=0)
    # check permuted_hidden_states, only valid token
    torch.testing.assert_close(gold_permuted_hidden_states[valid_row_idx],
                               permuted_hidden_states[valid_row_idx],
                               atol=0,
                               rtol=0)
    # add a random tensor to simulate group gemm
-    result0 = 0.5 * result0 + torch.randn_like(result0)
+    result0 = 0.5 * permuted_hidden_states + torch.randn_like(
        permuted_hidden_states)
    result4 = torch.empty_like(hidden_states)
    moe_unpermute(result4, result0, topk_weights, inv_permuted_idx,
                  expert_first_token_offset)
    result4 = moe_unpermute(result0, topk_weights, topk_ids, result2, result1,
                            topk, n_expert, n_local_expert)
    gold4 = torch_unpermute(result0, topk_weights, topk_ids,
-                            token_expert_indices, result2, valid_row_idx, topk,
+                            token_expert_indices, inv_permuted_idx,
-                            n_local_expert)
+                            valid_row_idx, topk, n_local_expert)
    # check unpermuted hidden
    torch.testing.assert_close(result4, gold4, atol=2e-2, rtol=0)
--- a/tests/model_executor/test_model_load_with_params.py
+++ b/tests/model_executor/test_model_load_with_params.py
@ -22,10 +22,12 @@ REVISION_ROBERTA = os.environ.get("REVISION", "main")
@pytest.mark.skipif(current_platform.is_rocm(),
                    reason="Xformers backend is not supported on ROCm.")
-def test_model_loading_with_params(vllm_runner):
+def test_model_loading_with_params(vllm_runner, monkeypatch):
    """
    Test parameter weight loading with tp>1.
    """
    # to use apply_model
    monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
    with vllm_runner(model_name=MODEL_NAME,
                     revision=REVISION,
                     dtype="float16",
@ -61,10 +63,12 @@ def test_model_loading_with_params(vllm_runner):
@pytest.mark.skipif(current_platform.is_rocm(),
                    reason="Xformers backend is not supported on ROCm.")
-def test_roberta_model_loading_with_params(vllm_runner):
+def test_roberta_model_loading_with_params(vllm_runner, monkeypatch):
    """
    Test parameter weight loading with tp>1.
    """
    # to use apply_model
    monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
    with vllm_runner(model_name=MODEL_NAME_ROBERTA,
                     revision=REVISION_ROBERTA,
                     dtype="float16",
@ -101,10 +105,12 @@ def test_roberta_model_loading_with_params(vllm_runner):
@pytest.mark.skipif(current_platform.is_rocm(),
                    reason="Xformers backend is not supported on ROCm.")
-def test_facebook_roberta_model_loading_with_params(vllm_runner):
+def test_facebook_roberta_model_loading_with_params(vllm_runner, monkeypatch):
    """
    Test loading roberta-base model with no lm_head.
    """
    # to use apply_model
    monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
    model_name = "FacebookAI/roberta-base"
    with vllm_runner(model_name=model_name,
                     dtype="float16",
--- a/tests/models/language/pooling/test_embedding.py
+++ b/tests/models/language/pooling/test_embedding.py
@ -39,17 +39,9 @@ def v1(run_with_both_engines):
        pytest.param("ssmits/Qwen2-7B-Instruct-embed-base",
                     marks=[pytest.mark.skip_v0, pytest.mark.cpu_model]),
        # [Encoder-only]
-        pytest.param(
+        pytest.param("BAAI/bge-base-en-v1.5", marks=[pytest.mark.core_model]),
-            "BAAI/bge-base-en-v1.5",
+        pytest.param("sentence-transformers/all-MiniLM-L12-v2"),
-            marks=[
+        pytest.param("intfloat/multilingual-e5-small"),
                # CPU only supports V1
                pytest.mark.core_model,
                pytest.mark.skip_v1
            ]),
        pytest.param("sentence-transformers/all-MiniLM-L12-v2",
                     marks=[pytest.mark.skip_v1]),
        pytest.param("intfloat/multilingual-e5-small",
                     marks=[pytest.mark.skip_v1]),
        pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct",
                     marks=[pytest.mark.skip_v1]),
        # [Cross-Encoder]
--- a/tests/models/language/pooling/test_jina.py
+++ b/tests/models/language/pooling/test_jina.py
@ -23,6 +23,14 @@ RERANK_MODELS = [
 ]
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
    """Autouse shim that pulls in ``run_with_both_engines`` for each test.

    The fixture has no logic of its own; requesting the
    ``run_with_both_engines`` fixture is the whole point. It could be
    promoted to conftest.py to apply to every test in a package.
    """
@pytest.mark.parametrize("model_info", EMBEDDING_MODELS)
 def test_embed_models_mteb(hf_runner, vllm_runner,
                           model_info: EmbedModelInfo) -> None:
--- a/tests/models/multimodal/generation/test_common.py
+++ b/tests/models/multimodal/generation/test_common.py
@ -677,6 +677,7 @@ VLM_TEST_SETTINGS = {
        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
        img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
        video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
        multi_image_prompt="Picture 1: <vlm_image>\nPicture 2: <vlm_image>\nDescribe these two images with one paragraph respectively.",    # noqa: E501
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModelForVision2Seq,
--- a/tests/models/multimodal/generation/test_maverick.py
+++ b/tests/models/multimodal/generation/test_maverick.py
@ -22,6 +22,9 @@ from transformers import (AutoConfig, AutoProcessor, AutoTokenizer,
                          GenerationConfig)
 from vllm import LLM, SamplingParams
 from vllm.v1.executor.abstract import Executor
 from vllm.v1.kv_cache_interface import (ChunkedLocalAttentionSpec,
                                        FullAttentionSpec)
 from ....utils import multi_gpu_test
@ -69,6 +72,26 @@ def run_maverick_serving(model: str):
        raise
 def get_rope_layers_config(model_path: str) -> list[int]:
    """Load the interleaved RoPE layout from a checkpoint's ``config.json``.

    Args:
        model_path: Local directory containing the reduced Maverick model
            checkpoint; ``config.json`` is read from directly inside it.

    Returns:
        The ``no_rope_layers`` list stored under ``text_config``: one entry
        per layer, where 0 indicates that RoPE is not used and 1 indicates
        that RoPE (and local attention) is used.
    """
    with open(Path(model_path) / "config.json") as config_file:
        hf_config = json.load(config_file)
    rope_flags = hf_config["text_config"]["no_rope_layers"]
    print(f"Found no_rope_layers: {rope_flags}")
    return rope_flags
 def create_reduced_maverick_model(
    original_model_name:
    str = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
@ -113,7 +136,6 @@ def create_reduced_maverick_model(
        print("Loading original model configuration...")
        original_config = AutoConfig.from_pretrained(original_model_name,
                                                     trust_remote_code=True)
        print("Creating reduced configuration...")
        reduced_config = create_reduced_config(original_config, text_layers,
                                               num_experts, vision_layers)
@ -510,21 +532,32 @@ def save_weights_to_safetensors(weights: dict[str, torch.Tensor],
          f"{index_data['metadata']['total_size'] / (1024**3):.2f} GB")
-def run_reduced_model(model_path: str,
+def check_attention_spec_interleaved_rope(
-                      should_profile: bool = False,
+    llm: LLM,
-                      **kwargs) -> None:
+    num_attention_layers: int,
-    """Test the created reduced model with vLLM."""
+    num_ranks: int,
-
+    rope_layers: list[int],
-    print(f"\nTesting reduced model at {model_path}...")
+):
-
+    """Check that the attention spec is correct."""
-    llm = LLM(
+    assert isinstance(llm.llm_engine.model_executor, Executor)
-        model=model_path,
+    kv_cache_specs_per_rank = llm.llm_engine.model_executor.get_kv_cache_specs(
        trust_remote_code=True,
        max_model_len=512,  # Small context for testing
        gpu_memory_utilization=0.3,  # Conservative memory usage
        **kwargs,
    )
    for rank in range(num_ranks):
        kv_cache_specs = kv_cache_specs_per_rank[rank]
        assert len(kv_cache_specs.keys()) == num_attention_layers
        for i in range(num_attention_layers):
            if rope_layers[i] == 0:
                expected_spec = FullAttentionSpec
            else:
                expected_spec = ChunkedLocalAttentionSpec
            assert isinstance(
                kv_cache_specs[
                    f"language_model.model.layers.{i}.self_attn.attn"],
                expected_spec)
 def run_reduced_model(llm: LLM, should_profile: bool = False) -> None:
    """Test the created reduced model with vLLM."""
    sampling_params = SamplingParams(temperature=0.8,
                                     top_p=0.95,
                                     max_tokens=50)
@ -551,6 +584,7 @@ def run_reduced_model(model_path: str,
@pytest.mark.parametrize("tp,ep", [(2, True)])
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
 def test_dummy_maverick(
    monkeypatch,
    original_model_name: str,
    text_layers: int,
    num_experts: int,
@ -562,6 +596,10 @@ def test_dummy_maverick(
    force_recreate: bool = True,
    profile: bool = False,
 ) -> None:
    # Disabling multiprocessing allows us to access the model executor from
    # the LLM engine
    monkeypatch.setenv("VLLM_USE_V1", "1")
    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
    model_path = create_reduced_maverick_model(
        original_model_name=original_model_name,
        output_dir=output_dir,
@ -573,11 +611,27 @@ def test_dummy_maverick(
    print(f"\nReduced model created successfully at: {model_path}")
-    run_reduced_model(model_path=model_path,
+    rope_layers = get_rope_layers_config(model_path)
-                      should_profile=profile,
+
-                      enforce_eager=enforce_eager,
+    llm = LLM(
-                      tensor_parallel_size=tp,
+        model=model_path,
-                      enable_expert_parallel=ep)
+        trust_remote_code=True,
        max_model_len=512,  # Small context for testing
        gpu_memory_utilization=0.3,  # Conservative memory usage
        enforce_eager=enforce_eager,
        tensor_parallel_size=tp,
        enable_expert_parallel=ep,
    )
    check_attention_spec_interleaved_rope(
        llm,
        text_layers,
        tp,
        rope_layers,
    )
    print(f"\nTesting reduced model at {model_path}...")
    run_reduced_model(llm=llm, should_profile=profile)
 def main():
--- a/tests/models/multimodal/generation/test_phi4_multimodal.py
+++ b/tests/models/multimodal/generation/test_phi4_multimodal.py
@ -0,0 +1,252 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import os
 from collections.abc import Sequence
 from typing import Optional
 import librosa
 import pytest
 from huggingface_hub import snapshot_download
 from vllm.assets.image import ImageAsset
 from vllm.lora.request import LoRARequest
 from vllm.multimodal.image import rescale_image_size
 from vllm.platforms import current_platform
 from ....conftest import (IMAGE_ASSETS, HfRunner, PromptAudioInput,
                          PromptImageInput, VllmRunner)
 from ....utils import large_gpu_test
 from ...utils import check_logprobs_close
 # Single-image chat prompts, keyed by the name of the image asset each
 # prompt is paired with.
 HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
    "stop_sign":
    "<|user|>\n<|image|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n",  # noqa: E501
    "cherry_blossom":
    "<|user|>\n<|image|>\nPlease infer the season with reason in details.<|end|>\n<|assistant|>\n",  # noqa: E501
 })
 # Prompt carrying two <|image|> placeholders, used by the multi-image test.
 HF_MULTIIMAGE_IMAGE_PROMPT = "<|user|>\n<|image|>\n<|image|>\nDescribe these images.<|end|>\n<|assistant|>\n"  # noqa: E501
 # NOTE(review): this call runs at import time, so merely collecting this
 # module resolves the checkpoint (pinned to revision "refs/pr/70");
 # presumably that needs hub access or a warm local cache -- confirm.
 model_path = snapshot_download("microsoft/Phi-4-multimodal-instruct",
                               revision="refs/pr/70")
 # Since the vision-lora and speech-lora co-exist with the base model,
 # we have to manually specify the path of the lora weights.
 vision_lora_path = os.path.join(model_path, "vision-lora")
 # Example audio question that ships inside the checkpoint directory.
 speech_question = os.path.join(model_path, "examples",
                               "what_is_shown_in_this_image.wav")
 # Every test below is parametrized over this single local checkpoint path.
 models = [model_path]
 # dtype string shared by all tests in this module.
 target_dtype = "half"
 # ROCm Triton FA can run into shared memory issues with these models,
 # use other backends in the meantime
 # FIXME (mattwong, gshtrasb, hongxiayan)
 # The variable is set on os.environ at import time, so it applies to the
 # whole test process, not only to the tests in this module.
 if current_platform.is_rocm():
    os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"
 def run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    inputs: Sequence[tuple[list[str], PromptImageInput,
                           Optional[PromptAudioInput]]],
    model: str,
    *,
    max_model_len: int,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    mm_limit: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
 ):
    """Check that vLLM and HF produce close greedy logprobs.

    Each entry of ``inputs`` is one test case: the prompts, the image
    inputs and the (optional) audio inputs for those prompts. Every case is
    generated once with vLLM, with the vision LoRA attached to the request,
    and once with the HF reference, with the vision adapter loaded. The
    paired outputs are then compared with ``check_logprobs_close``.

    Args:
        hf_runner: Context-manager factory for the HF reference model.
        vllm_runner: Context-manager factory for the vLLM model.
        inputs: The cases to run, as ``(prompts, images, audios)`` tuples.
        model: Model name or path handed to both runners.
        max_model_len: Context length for vLLM; it should be greater than
            the image feature size.
        dtype: dtype string handed to both runners.
        max_tokens: Maximum number of tokens generated per prompt.
        num_logprobs: Number of logprobs requested per generated token.
        mm_limit: Per-prompt image limit given to vLLM.
        tensor_parallel_size: Tensor-parallel degree for vLLM.
        distributed_executor_backend: Optional vLLM executor backend.
    """
    # NOTE: the order matters -- vLLM runs first, HF second. vLLM needs a
    # fresh process without CUDA initialization; running HF first would
    # initialize CUDA and hurt the multiprocessing backend with the fork
    # method (the default method).
    vllm_outputs_per_case = []
    with vllm_runner(
            model,
            task="generate",
            max_model_len=max_model_len,
            max_num_seqs=2,
            dtype=dtype,
            limit_mm_per_prompt={"image": mm_limit},
            tensor_parallel_size=tensor_parallel_size,
            distributed_executor_backend=distributed_executor_backend,
            enable_lora=True,
            max_lora_rank=320,
            gpu_memory_utilization=0.8,  # set to 0.8 to avoid OOM in CI
            enforce_eager=True,
            trust_remote_code=False,
    ) as vllm_model:
        vision_lora = LoRARequest("vision", 1, vision_lora_path)
        for prompts, images, audios in inputs:
            case_outputs = vllm_model.generate_greedy_logprobs(
                prompts,
                max_tokens,
                num_logprobs=num_logprobs,
                images=images,
                audios=audios,
                lora_request=vision_lora,
            )
            vllm_outputs_per_case.append(case_outputs)

    hf_outputs_per_case = []
    with hf_runner(model, dtype=dtype) as hf_model:
        hf_model.model.load_adapter(vision_lora_path, adapter_name="vision")
        eos_token_id = hf_model.processor.tokenizer.eos_token_id
        for prompts, images, audios in inputs:
            case_outputs = hf_model.generate_greedy_logprobs_limit(
                prompts,
                max_tokens,
                num_logprobs=num_logprobs,
                images=images,
                audios=audios,
                eos_token_id=eos_token_id,
            )
            hf_outputs_per_case.append(case_outputs)

    # Compare case by case, HF as reference ("0") against vLLM ("1").
    paired_outputs = zip(hf_outputs_per_case, vllm_outputs_per_case)
    for hf_outputs, vllm_outputs in paired_outputs:
        check_logprobs_close(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=vllm_outputs,
            name_0="hf",
            name_1="vllm",
        )
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "size_factors",
    [
        [],  # no image
        [1.0],  # single-scale
        [1.0, 1.0, 1.0],  # single-scale, batched
        [0.25, 0.5, 1.0],  # multi-scale
    ],
)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_model_len", [12800])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
                dtype: str, max_model_len: int, max_tokens: int,
                num_logprobs: int) -> None:
    """Compare vLLM and HF on single-image prompts at several image scales.

    One case is built per (image asset, prompt) pair; within a case the
    prompt is repeated once per size factor and paired with the image
    rescaled by that factor. No audio is supplied.
    """
    images = [asset.pil_image for asset in image_assets]
    num_variants = len(size_factors)

    inputs_per_image = []
    for image, prompt in zip(images, HF_IMAGE_PROMPTS):
        rescaled = [
            rescale_image_size(image, factor) for factor in size_factors
        ]
        inputs_per_image.append(([prompt] * num_variants, rescaled, None))

    run_test(
        hf_runner,
        vllm_runner,
        inputs_per_image,
        model,
        dtype=dtype,
        max_model_len=max_model_len,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        mm_limit=1,
        tensor_parallel_size=1,
    )
@large_gpu_test(min_gb=48)
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "size_factors",
    [
        # The no-image case ([]) is intentionally left out here.
        [1.0],  # single-scale
        [1.0, 1.0, 1.0],  # single-scale, batched
        [0.25, 0.5, 1.0],  # multi-scale
    ],
)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_model_len", [25600])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
                             size_factors, dtype: str, max_model_len: int,
                             max_tokens: int, num_logprobs: int) -> None:
    """Compare vLLM and HF when every prompt carries all image assets.

    A single case is built: the multi-image prompt is repeated once per
    size factor, and each repetition gets the full set of images rescaled
    by that factor. No audio is supplied.
    """
    images = [asset.pil_image for asset in image_assets]

    prompts = [HF_MULTIIMAGE_IMAGE_PROMPT] * len(size_factors)
    images_per_prompt = []
    for factor in size_factors:
        images_per_prompt.append(
            [rescale_image_size(image, factor) for image in images])
    inputs_per_case = [(prompts, images_per_prompt, None)]

    run_test(
        hf_runner,
        vllm_runner,
        inputs_per_case,
        model,
        dtype=dtype,
        max_model_len=max_model_len,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        mm_limit=2,
        tensor_parallel_size=1,
    )
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_model_len", [12800])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_vision_speech_models(hf_runner, vllm_runner, model, dtype: str,
                              max_model_len: int, max_tokens: int,
                              num_logprobs: int) -> None:
    """Compare vLLM and HF on a prompt combining one image and one audio."""
    # The bundled example speech question keeps the model outputs
    # reasonable.
    audio = librosa.load(speech_question, sr=16000)
    cherry_blossom = ImageAsset("cherry_blossom").pil_image
    image = cherry_blossom.convert("RGB")

    prompts = ["<|user|><|image|><|audio|><|end|><|assistant|>"]
    inputs_vision_speech = [(prompts, [image], [audio])]

    run_test(
        hf_runner,
        vllm_runner,
        inputs_vision_speech,
        model,
        dtype=dtype,
        max_model_len=max_model_len,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        mm_limit=1,
        tensor_parallel_size=1,
    )
--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@ -41,12 +41,18 @@ def glm4_1v_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict:
 def _test_processing_correctness(
-    model_id: str,
+    model_id_or_arch: str,
    hit_rate: float,
    num_batches: int,
    simplify_rate: float,
 ):
-    model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
+    if model_id_or_arch in HF_EXAMPLE_MODELS.get_supported_archs():
        # Use model architecture to get the default model id
        model_info = HF_EXAMPLE_MODELS.get_hf_info(model_id_or_arch)
        model_id = model_info.default
    else:
        model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id_or_arch)
        model_id = model_id_or_arch
    model_info.check_available_online(on_fail="skip")
    model_info.check_transformers_version(on_fail="skip")
@ -58,7 +64,7 @@ def _test_processing_correctness(
        trust_remote_code=model_info.trust_remote_code,
        seed=0,
        dtype="auto",
-        revision=None,
+        revision=model_info.revision,
        hf_overrides=model_info.hf_overrides,
    )
@ -272,12 +278,14 @@ def _test_processing_correctness_one(
    "THUDM/GLM-4.1V-9B-Thinking",
    "ibm-granite/granite-speech-3.3-2b",
    "h2oai/h2ovl-mississippi-800m",
    "internlm/Intern-S1",
    "OpenGVLab/InternVL2-1B",
    "OpenGVLab/InternVL3-1B",
    "HuggingFaceM4/Idefics3-8B-Llama3",
    "HuggingFaceTB/SmolVLM2-2.2B-Instruct",
    "moonshotai/Kimi-VL-A3B-Instruct",
    "meta-llama/Llama-4-Scout-17B-16E-Instruct",
    "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B",
    "llava-hf/llava-1.5-7b-hf",
    "llava-hf/llava-v1.6-mistral-7b-hf",
    "llava-hf/LLaVA-NeXT-Video-7B-hf",
@ -330,6 +338,28 @@ def test_processing_correctness(
    )
 # Phi4MultimodalForCausalLM shares its model repo with the original-format
 # Phi4MMForCausalLM, so it is covered by a separate test case.
 # Remove this test once the conversion PR is merged:
 # https://huggingface.co/microsoft/Phi-4-multimodal-instruct/discussions/70
@pytest.mark.parametrize("model_arch", ["Phi4MultimodalForCausalLM"])
@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
@pytest.mark.parametrize("num_batches", [32])
@pytest.mark.parametrize("simplify_rate", [1.0])
def test_processing_correctness_phi4_multimodal(
    model_arch: str,
    hit_rate: float,
    num_batches: int,
    simplify_rate: float,
):
    """Run the shared processing-correctness check by architecture name.

    The architecture name (rather than a model id) is handed to
    ``_test_processing_correctness`` as its first positional argument.
    """
    check_settings = {
        "hit_rate": hit_rate,
        "num_batches": num_batches,
        "simplify_rate": simplify_rate,
    }
    _test_processing_correctness(model_arch, **check_settings)
 def _assert_inputs_equal(
    a: MultiModalInputs,
    b: MultiModalInputs,
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@ -201,6 +201,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
                                               trust_remote_code=True),
    "HunYuanDenseV1ForCausalLM":_HfExamplesInfo("tencent/Hunyuan-7B-Instruct-0124",
                                               trust_remote_code=True),
    "HCXVisionForCausalLM": _HfExamplesInfo(
        "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B",
        trust_remote_code=True),
    "InternLMForCausalLM": _HfExamplesInfo("internlm/internlm-chat-7b",
                                           trust_remote_code=True),
    "InternLM2ForCausalLM": _HfExamplesInfo("internlm/internlm2-chat-7b",
@ -218,6 +221,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
                                                "fp8": "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8"}),  # noqa: E501
    "LLaMAForCausalLM": _HfExamplesInfo("decapoda-research/llama-7b-hf",
                                        is_available_online=False),
    "Llama4ForCausalLM": _HfExamplesInfo("meta-llama/Llama-4-Scout-17B-16E-Instruct", # noqa: E501
                                         is_available_online=False),
    "MambaForCausalLM": _HfExamplesInfo("state-spaces/mamba-130m-hf"),
    "Mamba2ForCausalLM": _HfExamplesInfo("mistralai/Mamba-Codestral-7B-v0.1"),
    "FalconMambaForCausalLM": _HfExamplesInfo("tiiuae/falcon-mamba-7b-instruct"),  # noqa: E501
@ -376,6 +381,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
                                         extras={"2B": "OpenGVLab/InternVL2-2B",
                                                 "3.0": "OpenGVLab/InternVL3-1B"},  # noqa: E501
                                         trust_remote_code=True),
    "InternS1ForConditionalGeneration": _HfExamplesInfo("internlm/Intern-S1",
                                         trust_remote_code=True),
    "Idefics3ForConditionalGeneration": _HfExamplesInfo("HuggingFaceM4/Idefics3-8B-Llama3",  # noqa: E501
                                                        {"tiny": "HuggingFaceTB/SmolVLM-256M-Instruct"}),  # noqa: E501
    "KeyeForConditionalGeneration": _HfExamplesInfo("Kwai-Keye/Keye-VL-8B-Preview", # noqa: E501
@ -426,6 +433,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
                                    "1.6-gemma": "AIDC-AI/Ovis1.6-Gemma2-9B"}),  # noqa: E501
    "Phi4MMForCausalLM": _HfExamplesInfo("microsoft/Phi-4-multimodal-instruct",
                                        trust_remote_code=True),
    "Phi4MultimodalForCausalLM": _HfExamplesInfo("microsoft/Phi-4-multimodal-instruct",  # noqa: E501
                                                 revision="refs/pr/70"),
    "PixtralForConditionalGeneration": _HfExamplesInfo("mistralai/Pixtral-12B-2409",  # noqa: E501
                                                       tokenizer_mode="mistral"),
    "QwenVLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen-VL",
--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
@ -17,7 +17,9 @@ from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tenso
    CompressedTensorsW4A4Fp4, CompressedTensorsW4A16Fp4,
    CompressedTensorsW4A16Sparse24, CompressedTensorsW8A8Fp8,
    CompressedTensorsW8A8Int8, CompressedTensorsW8A16Fp8,
-    CompressedTensorsWNA16, cutlass_fp4_supported)
+    CompressedTensorsWNA16)
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
    cutlass_fp4_supported)
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
    sparse_cutlass_supported)
 from vllm.platforms import current_platform
--- a/tests/quantization/test_rtn.py
+++ b/tests/quantization/test_rtn.py
@ -8,7 +8,10 @@ import pytest
 from tests.quantization.utils import is_quant_method_supported
-MODELS = ["microsoft/Phi-3-mini-4k-instruct"]
+MODELS = [
    "microsoft/Phi-3-mini-4k-instruct",  # dense model
    "ai21labs/Jamba-tiny-dev",  # MoE model
 ]
@pytest.mark.skipif(not is_quant_method_supported("rtn"),
--- a/tests/standalone_tests/test_tensor_schema.py
+++ b/tests/standalone_tests/test_tensor_schema.py
@ -4,6 +4,7 @@
 import pytest
 import torch
 from vllm.model_executor.models.fuyu import FuyuImagePatchInputs
 from vllm.model_executor.models.phi3v import Phi3VImagePixelInputs
@ -124,3 +125,24 @@ def test_tensor_schema_with_invalid_resolve_binding_dims():
                "w": 336
            },
        )
 def test_tensor_schema_with_list_of_symbolic_dim():
    """A list whose length equals the leading tensor dim is accepted."""
    num_images = 3
    rows = [torch.randn(768) for _ in range(num_images)]
    flat_data = torch.stack(rows)  # (bn=3, fn)
    patches_per_image = [64] * num_images  # len = bn = 3

    # Construction itself is the check: it must not raise.
    FuyuImagePatchInputs(flat_data=flat_data,
                         patches_per_image=patches_per_image)
 def test_tensor_schema_with_list_of_symbolic_dim_mismatch_in_length():
    """A list shorter than the leading tensor dim is rejected."""
    rows = [torch.randn(768) for _ in range(4)]
    flat_data = torch.stack(rows)  # (bn=4, fn)
    patches_per_image = [64] * 3  # len = 3 != bn

    schema_kwargs = {
        "flat_data": flat_data,
        "patches_per_image": patches_per_image,
    }
    with pytest.raises(ValueError, match="expected 'bn'=4, got 3"):
        FuyuImagePatchInputs(**schema_kwargs)
--- a/tests/v1/attention/utils.py
+++ b/tests/v1/attention/utils.py
@ -93,6 +93,7 @@ def create_common_attn_metadata(
        max_query_len=max_query_len,
        block_table_tensor=block_table_tensor,
        slot_mapping=slot_mapping,
        causal=True,
    )
--- a/tests/v1/test_oracle.py
+++ b/tests/v1/test_oracle.py
@ -13,7 +13,6 @@ UNSUPPORTED_MODELS_V1 = [
    "openai/whisper-large-v3",  # transcription
    "facebook/bart-large-cnn",  # encoder decoder
    "state-spaces/mamba-130m-hf",  # mamba1
    "BAAI/bge-m3",  # embedding
 ]
 MODEL = "meta-llama/Llama-3.2-1B-Instruct"
--- a/tests/v1/test_utils.py
+++ b/tests/v1/test_utils.py
@ -1,9 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import re
 import pytest
 import regex as re
 import requests
 import torch
--- a/tests/v1/tpu/test_basic.py
+++ b/tests/v1/tpu/test_basic.py
@ -59,7 +59,7 @@ def test_basic(
                # actually test chunked prompt
                max_num_batched_tokens=1024,
                max_model_len=8192,
-                gpu_memory_utilization=0.95,
+                gpu_memory_utilization=0.7,
                max_num_seqs=max_num_seqs,
                tensor_parallel_size=tensor_parallel_size) as vllm_model:
            vllm_outputs = vllm_model.generate_greedy(example_prompts,
--- a/vllm/benchmarks/utils.py
+++ b/vllm/benchmarks/utils.py
@ -67,4 +67,9 @@ class InfEncoder(json.JSONEncoder):
 def write_to_json(filename: str, records: list) -> None:
    with open(filename, "w") as f:
-        json.dump(records, f, cls=InfEncoder)
+        json.dump(
            records,
            f,
            cls=InfEncoder,
            default=lambda o: f"<{type(o).__name__} is not JSON serializable>",
        )
--- a/vllm/config.py
+++ b/vllm/config.py
@ -4790,26 +4790,26 @@ class VllmConfig:
    def __str__(self):
        return (
-            f"model={self.model_config.model!r},"
+            f"model={self.model_config.model!r}, "
-            f" speculative_config={self.speculative_config!r},"
+            f"speculative_config={self.speculative_config!r}, "
-            f" tokenizer={self.model_config.tokenizer!r}, "
+            f"tokenizer={self.model_config.tokenizer!r}, "
-            f"skip_tokenizer_init={self.model_config.skip_tokenizer_init},"
+            f"skip_tokenizer_init={self.model_config.skip_tokenizer_init}, "
-            f" tokenizer_mode={self.model_config.tokenizer_mode}, "
+            f"tokenizer_mode={self.model_config.tokenizer_mode}, "
            f"revision={self.model_config.revision}, "
-            f"override_neuron_config={self.model_config.override_neuron_config},"
+            f"override_neuron_config={self.model_config.override_neuron_config}, "  # noqa
-            f" tokenizer_revision={self.model_config.tokenizer_revision}, "
+            f"tokenizer_revision={self.model_config.tokenizer_revision}, "
            f"trust_remote_code={self.model_config.trust_remote_code}, "
            f"dtype={self.model_config.dtype}, "
-            f"max_seq_len={self.model_config.max_model_len},"
+            f"max_seq_len={self.model_config.max_model_len}, "
-            f" download_dir={self.load_config.download_dir!r}, "
+            f"download_dir={self.load_config.download_dir!r}, "
            f"load_format={self.load_config.load_format}, "
-            f"tensor_parallel_size={self.parallel_config.tensor_parallel_size},"
+            f"tensor_parallel_size={self.parallel_config.tensor_parallel_size}, "  # noqa
-            f" pipeline_parallel_size={self.parallel_config.pipeline_parallel_size}, "  # noqa
+            f"pipeline_parallel_size={self.parallel_config.pipeline_parallel_size}, "  # noqa
            f"disable_custom_all_reduce={self.parallel_config.disable_custom_all_reduce}, "  # noqa
            f"quantization={self.model_config.quantization}, "
            f"enforce_eager={self.model_config.enforce_eager}, "
            f"kv_cache_dtype={self.cache_config.cache_dtype}, "
-            f" device_config={self.device_config.device}, "
+            f"device_config={self.device_config.device}, "
            f"decoding_config={self.decoding_config!r}, "
            f"observability_config={self.observability_config!r}, "
            f"seed={self.model_config.seed}, "
--- a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py
@ -156,8 +156,16 @@ class SharedStorageConnector(KVConnectorBase_V1):
            logger.info("Inject KV cache of %d tokens to the paged memory",
                        len(request.slot_mapping))
            for layer_name in forward_context.no_compile_layers:
-                attn_layer = forward_context.no_compile_layers[layer_name]
+                layer = forward_context.no_compile_layers[layer_name]
-                kv_cache_layer = attn_layer.kv_cache[\
+
                # Only process layers that have kv_cache
                # attribute (attention layers) Skip non-attention
                # layers like FusedMoE/MLP etc.
                kv_cache_attr = getattr(layer, 'kv_cache', None)
                if kv_cache_attr is None:
                    continue
                kv_cache_layer = kv_cache_attr[ \
                        forward_context.virtual_engine]
                filename = self._generate_filename_debug(
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@ -1649,7 +1649,8 @@ class EngineArgs:
        if (self.max_num_seqs is None
                and usage_context in default_max_num_seqs):
-            self.max_num_seqs = default_max_num_seqs[usage_context]
+            self.max_num_seqs = min(default_max_num_seqs[usage_context],
                                    self.max_num_batched_tokens or sys.maxsize)
            logger.debug("Setting max_num_seqs to %d for %s usage context.",
                         self.max_num_seqs, use_context_value)
--- a/vllm/engine/multiprocessing/client.py
+++ b/vllm/engine/multiprocessing/client.py
@ -97,11 +97,16 @@ class MQLLMEngineClient(EngineClient):
        self.model_config = engine_config.model_config
        self.decoding_config = engine_config.decoding_config
-        # Create the tokenizer group.
+        if self.vllm_config.model_config.skip_tokenizer_init:
-        self.tokenizer = init_tokenizer_from_configs(
+            self.tokenizer = None
-            model_config=self.model_config,
+
-            scheduler_config=engine_config.scheduler_config,
+        else:
-            lora_config=engine_config.lora_config)
+            # Create the tokenizer group.
            self.tokenizer = init_tokenizer_from_configs(
                model_config=self.model_config,
                scheduler_config=engine_config.scheduler_config,
                lora_config=engine_config.lora_config)
        self.input_preprocessor = InputPreprocessor(self.model_config,
                                                    self.tokenizer)
@ -375,7 +380,10 @@ class MQLLMEngineClient(EngineClient):
        return self.input_preprocessor
    async def get_tokenizer(self, lora_request: Optional[LoRARequest] = None):
-        return await self.tokenizer.get_lora_tokenizer_async(lora_request)
+        if self.tokenizer is None:
            return None
        else:
            return await self.tokenizer.get_lora_tokenizer_async(lora_request)
    async def get_vllm_config(self) -> VllmConfig:
        return self.vllm_config
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@ -14,6 +14,7 @@ from pydantic import ValidationError
 from tqdm.auto import tqdm
 from typing_extensions import TypeVar, deprecated
 import vllm.envs as envs
 from vllm.beam_search import (BeamSearchInstance, BeamSearchOutput,
                              BeamSearchSequence,
                              create_sort_beams_key_function)
@ -44,9 +45,10 @@ from vllm.model_executor.layers.quantization import QuantizationMethods
 from vllm.outputs import (ClassificationRequestOutput, EmbeddingRequestOutput,
                          PoolingRequestOutput, RequestOutput,
                          ScoringRequestOutput)
-from vllm.pooling_params import PoolingParams, PoolingTask
+from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import (BeamSearchParams, GuidedDecodingParams,
                                  RequestOutputKind, SamplingParams)
 from vllm.tasks import PoolingTask
 from vllm.transformers_utils.tokenizer import (AnyTokenizer, MistralTokenizer,
                                               get_cached_tokenizer)
 from vllm.usage.usage_lib import UsageContext
@ -277,6 +279,16 @@ class LLM:
        self.request_counter = Counter()
        self.default_sampling_params: Union[dict[str, Any], None] = None
        if envs.VLLM_USE_V1:
            supported_tasks = self.llm_engine \
                .get_supported_tasks()  # type: ignore
        else:
            supported_tasks = self.llm_engine.model_config.supported_tasks
        logger.info("Supported_tasks: %s", supported_tasks)
        self.supported_tasks = supported_tasks
    def get_tokenizer(
        self,
        lora_request: Optional[LoRARequest] = None,
@ -1170,8 +1182,7 @@ class LLM:
            A list of `EmbeddingRequestOutput` objects containing the
            embedding vectors in the same order as the input prompts.
        """
-        model_config = self.llm_engine.model_config
+        if "embed" not in self.supported_tasks:
        if "embed" not in model_config.supported_tasks:
            raise ValueError("Embedding API is not supported by this model. "
                             "Please set `--task embed`.")
@ -1215,8 +1226,7 @@ class LLM:
            A list of `ClassificationRequestOutput` objects containing the
            embedding vectors in the same order as the input prompts.
        """
-        model_config = self.llm_engine.model_config
+        if "classify" not in self.supported_tasks:
        if "classify" not in model_config.supported_tasks:
            raise ValueError(
                "Classification API is not supported by this model. "
                "Please set `--task classify`.")
@ -1397,8 +1407,8 @@ class LLM:
            raise ValueError(" ".join(messages))
-        if all(t not in model_config.supported_tasks
+        supported_tasks = self.supported_tasks
-               for t in ("embed", "classify")):
+        if all(t not in supported_tasks for t in ("embed", "classify")):
            raise ValueError("Score API is not supported by this model. "
                             "Please set `--task embed` or `--task classify`.")
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@ -1586,6 +1586,14 @@ async def init_app_state(
    state.vllm_config = vllm_config
    model_config = vllm_config.model_config
    if envs.VLLM_USE_V1:
        supported_tasks = await engine_client \
            .get_supported_tasks()  # type: ignore
    else:
        supported_tasks = model_config.supported_tasks
    logger.info("Supported_tasks: %s", supported_tasks)
    resolved_chat_template = load_chat_template(args.chat_template)
    if resolved_chat_template is not None:
        # Get the tokenizer to check official template
@ -1647,7 +1655,7 @@ async def init_app_state(
        reasoning_parser=args.reasoning_parser,
        enable_prompt_tokens_details=args.enable_prompt_tokens_details,
        enable_force_include_usage=args.enable_force_include_usage,
-    ) if "generate" in model_config.supported_tasks else None
+    ) if "generate" in supported_tasks else None
    state.openai_serving_chat = OpenAIServingChat(
        engine_client,
        model_config,
@ -1664,7 +1672,7 @@ async def init_app_state(
        reasoning_parser=args.reasoning_parser,
        enable_prompt_tokens_details=args.enable_prompt_tokens_details,
        enable_force_include_usage=args.enable_force_include_usage,
-    ) if "generate" in model_config.supported_tasks else None
+    ) if "generate" in supported_tasks else None
    state.openai_serving_completion = OpenAIServingCompletion(
        engine_client,
        model_config,
@ -1673,7 +1681,7 @@ async def init_app_state(
        return_tokens_as_token_ids=args.return_tokens_as_token_ids,
        enable_prompt_tokens_details=args.enable_prompt_tokens_details,
        enable_force_include_usage=args.enable_force_include_usage,
-    ) if "generate" in model_config.supported_tasks else None
+    ) if "generate" in supported_tasks else None
    state.openai_serving_pooling = OpenAIServingPooling(
        engine_client,
        model_config,
@ -1681,7 +1689,7 @@ async def init_app_state(
        request_logger=request_logger,
        chat_template=resolved_chat_template,
        chat_template_content_format=args.chat_template_content_format,
-    ) if "encode" in model_config.supported_tasks else None
+    ) if "encode" in supported_tasks else None
    state.openai_serving_embedding = OpenAIServingEmbedding(
        engine_client,
        model_config,
@ -1689,24 +1697,22 @@ async def init_app_state(
        request_logger=request_logger,
        chat_template=resolved_chat_template,
        chat_template_content_format=args.chat_template_content_format,
-    ) if "embed" in model_config.supported_tasks else None
+    ) if "embed" in supported_tasks else None
    state.openai_serving_classification = ServingClassification(
        engine_client,
        model_config,
        state.openai_serving_models,
        request_logger=request_logger,
-    ) if "classify" in model_config.supported_tasks else None
+    ) if "classify" in supported_tasks else None
-    enable_serving_reranking = ("classify" in model_config.supported_tasks
+    enable_serving_reranking = ("classify" in supported_tasks and getattr(
-                                and getattr(model_config.hf_config,
+        model_config.hf_config, "num_labels", 0) == 1)
                                            "num_labels", 0) == 1)
    state.openai_serving_scores = ServingScores(
        engine_client,
        model_config,
        state.openai_serving_models,
        request_logger=request_logger,
-    ) if ("embed" in model_config.supported_tasks
+    ) if ("embed" in supported_tasks or enable_serving_reranking) else None
          or enable_serving_reranking) else None
    state.openai_serving_tokenization = OpenAIServingTokenization(
        engine_client,
@ -1721,13 +1727,13 @@ async def init_app_state(
        model_config,
        state.openai_serving_models,
        request_logger=request_logger,
-    ) if "transcription" in model_config.supported_tasks else None
+    ) if "transcription" in supported_tasks else None
    state.openai_serving_translation = OpenAIServingTranslation(
        engine_client,
        model_config,
        state.openai_serving_models,
        request_logger=request_logger,
-    ) if "transcription" in model_config.supported_tasks else None
+    ) if "transcription" in supported_tasks else None
    state.task = model_config.task
    state.enable_server_load_tracking = args.enable_server_load_tracking
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@ -1007,6 +1007,13 @@ class CompletionRequest(OpenAIBaseModel):
            "default: 0). Any priority other than 0 will raise an error "
            "if the served model does not use priority scheduling."),
    )
    request_id: str = Field(
        default_factory=lambda: f"{random_uuid()}",
        description=(
            "The request_id related to this request. If the caller does "
            "not set it, a random_uuid will be generated. This id is used "
            "through out the inference process and return in response."),
    )
    logits_processors: Optional[LogitsProcessors] = Field(
        default=None,
        description=(
@ -1251,6 +1258,13 @@ class EmbeddingCompletionRequest(OpenAIBaseModel):
            "default: 0). Any priority other than 0 will raise an error "
            "if the served model does not use priority scheduling."),
    )
    request_id: str = Field(
        default_factory=lambda: f"{random_uuid()}",
        description=(
            "The request_id related to this request. If the caller does "
            "not set it, a random_uuid will be generated. This id is used "
            "through out the inference process and return in response."),
    )
    # --8<-- [end:embedding-extra-params]
@ -1302,6 +1316,13 @@ class EmbeddingChatRequest(OpenAIBaseModel):
            "default: 0). Any priority other than 0 will raise an error "
            "if the served model does not use priority scheduling."),
    )
    request_id: str = Field(
        default_factory=lambda: f"{random_uuid()}",
        description=(
            "The request_id related to this request. If the caller does "
            "not set it, a random_uuid will be generated. This id is used "
            "through out the inference process and return in response."),
    )
    # --8<-- [end:chat-embedding-extra-params]
    @model_validator(mode="before")
--- a/vllm/entrypoints/openai/run_batch.py
+++ b/vllm/entrypoints/openai/run_batch.py
@ -14,6 +14,7 @@ import torch
 from prometheus_client import start_http_server
 from tqdm import tqdm
 import vllm.envs as envs
 from vllm.config import VllmConfig
 from vllm.engine.arg_utils import AsyncEngineArgs, optional_type
 from vllm.engine.protocol import EngineClient
@ -335,6 +336,14 @@ async def run_batch(
    model_config = vllm_config.model_config
    if envs.VLLM_USE_V1:
        supported_tasks = await engine_client \
            .get_supported_tasks()  # type: ignore
    else:
        supported_tasks = model_config.supported_tasks
    logger.info("Supported_tasks: %s", supported_tasks)
    # Create the openai serving objects.
    openai_serving_models = OpenAIServingModels(
        engine_client=engine_client,
@ -351,7 +360,7 @@ async def run_batch(
        chat_template=None,
        chat_template_content_format="auto",
        enable_prompt_tokens_details=args.enable_prompt_tokens_details,
-    ) if "generate" in model_config.supported_tasks else None
+    ) if "generate" in supported_tasks else None
    openai_serving_embedding = OpenAIServingEmbedding(
        engine_client,
        model_config,
@ -359,19 +368,17 @@ async def run_batch(
        request_logger=request_logger,
        chat_template=None,
        chat_template_content_format="auto",
-    ) if "embed" in model_config.supported_tasks else None
+    ) if "embed" in supported_tasks else None
-    enable_serving_reranking = ("classify" in model_config.supported_tasks
+    enable_serving_reranking = ("classify" in supported_tasks and getattr(
-                                and getattr(model_config.hf_config,
+        model_config.hf_config, "num_labels", 0) == 1)
                                            "num_labels", 0) == 1)
    openai_serving_scores = ServingScores(
        engine_client,
        model_config,
        openai_serving_models,
        request_logger=request_logger,
-    ) if ("embed" in model_config.supported_tasks
+    ) if ("embed" in supported_tasks or enable_serving_reranking) else None
          or enable_serving_reranking) else None
    tracker = BatchProgressTracker()
    logger.info("Reading batch from %s...", args.input_file)
--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@ -113,7 +113,9 @@ class OpenAIServingCompletion(OpenAIServing):
            return self.create_error_response(
                "Echo is unsupported with prompt embeds.")
-        request_id = f"cmpl-{self._base_request_id(raw_request)}"
+        request_id = (
            f"cmpl-"
            f"{self._base_request_id(raw_request, request.request_id)}")
        created_time = int(time.time())
        request_metadata = RequestResponseMetadata(request_id=request_id)
--- a/vllm/entrypoints/openai/serving_embedding.py
+++ b/vllm/entrypoints/openai/serving_embedding.py
@ -163,8 +163,9 @@ class OpenAIServingEmbedding(EmbeddingMixin):
        for the API specification. This API mimics the OpenAI Embedding API.
        """
        model_name = self._get_model_name(request.model)
-        request_id = (f"{self.request_id_prefix}-"
+        request_id = (
-                      f"{self._base_request_id(raw_request)}")
+            f"{self.request_id_prefix}-"
            f"{self._base_request_id(raw_request, request.request_id)}")
        ctx = EmbeddingServeContext(
            request=request,
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@ -880,7 +880,10 @@ class OpenAIServing:
        _chat_template_kwargs.update(chat_template_kwargs or {})
        request_prompt: Union[str, list[int]]
-        if isinstance(tokenizer, MistralTokenizer):
+
        if tokenizer is None:
            request_prompt = "placeholder"
        elif isinstance(tokenizer, MistralTokenizer):
            request_prompt = apply_mistral_chat_template(
                tokenizer,
                messages=messages,
@ -910,7 +913,14 @@ class OpenAIServing:
            request = tool_parser(tokenizer).adjust_request(  # type: ignore
                request=request)
-        if isinstance(request_prompt, str):
+        if tokenizer is None:
            assert isinstance(request_prompt, str), (
                "Prompt has to be a string", \
                "when the tokenizer is not initialised"
            )
            prompt_inputs = TextTokensPrompt(prompt=request_prompt,
                                             prompt_token_ids=[1])
        elif isinstance(request_prompt, str):
            prompt_inputs = await self._tokenize_prompt_input_async(
                request,
                tokenizer,
@ -947,9 +957,11 @@ class OpenAIServing:
        def _load_and_validate_embed(embed: bytes) -> EmbedsPrompt:
            tensor = torch.load(io.BytesIO(base64.b64decode(embed)),
                                weights_only=True)
-            assert isinstance(
+            assert isinstance(tensor, torch.Tensor) and tensor.dtype in (
-                tensor,
+                torch.float32,
-                (torch.FloatTensor, torch.BFloat16Tensor, torch.HalfTensor))
+                torch.bfloat16,
                torch.float16,
            )
            if tensor.dim() > 2:
                tensor = tensor.squeeze(0)
                assert tensor.dim() == 2
--- a/vllm/entrypoints/openai/serving_pooling.py
+++ b/vllm/entrypoints/openai/serving_pooling.py
@ -96,7 +96,11 @@ class OpenAIServingPooling(OpenAIServing):
                self.max_model_len, truncate_prompt_tokens)
            lora_request = self._maybe_get_adapters(request)
-            tokenizer = await self.engine_client.get_tokenizer(lora_request)
+            if self.model_config.skip_tokenizer_init:
                tokenizer = None
            else:
                tokenizer = await self.engine_client.get_tokenizer(lora_request
                                                                   )
            if isinstance(request, PoolingChatRequest):
                (
--- a/vllm/executor/executor_base.py
+++ b/vllm/executor/executor_base.py
@ -16,8 +16,8 @@ from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.pooling_params import PoolingTask
 from vllm.sequence import ExecuteModelRequest, PoolerOutput
 from vllm.tasks import SupportedTask
 from vllm.utils import make_async
 from vllm.worker.worker_base import WorkerBase
@ -136,9 +136,9 @@ class ExecutorBase(ABC):
        return self.collective_rpc(rpc_func)
    @cached_property  # Avoid unnecessary RPC calls
-    def supported_pooling_tasks(self) -> tuple[PoolingTask, ...]:
+    def supported_tasks(self) -> tuple[SupportedTask, ...]:
-        output = self.collective_rpc("get_supported_pooling_tasks")
+        output = self.collective_rpc("get_supported_tasks")
-        return tuple({task for tasks in output for task in tasks})
+        return output[0]
    def execute_model(
        self, execute_model_req: ExecuteModelRequest
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@ -1127,6 +1127,7 @@ def flashinfer_fused_moe_blockscale_fp8(
        tile_tokens_dim=_get_tile_tokens_dim(x.shape[0], top_k,
                                             global_num_experts),
        routing_method_type=2,  # DeepSeek-styled routing method
        use_shuffled_weight=False,
    )
--- a/vllm/model_executor/layers/fused_moe/moe_align_block_size.py
+++ b/vllm/model_executor/layers/fused_moe/moe_align_block_size.py
@ -5,144 +5,8 @@ from typing import Optional
 import torch
 from vllm import _custom_ops as ops
-from vllm.triton_utils import tl, triton
+from vllm.triton_utils import triton
-from vllm.utils import cdiv, round_up
+from vllm.utils import round_up
@triton.jit
 def moe_align_block_size_stage1(
    topk_ids_ptr,
    tokens_cnts_ptr,
    num_experts: tl.constexpr,
    numel: tl.constexpr,
    tokens_per_thread: tl.constexpr,
 ):
    # Stage 1: per-program histogram of the values in topk_ids.
    #
    # Program `pid` scans the contiguous slice
    # [pid * tokens_per_thread, (pid + 1) * tokens_per_thread) of topk_ids
    # and, for every value `idx` it reads, increments the counter stored at
    # tokens_cnts_ptr + (pid + 1) * num_experts + idx.
    # Counts for program `pid` therefore land in the block that starts
    # (pid + 1) * num_experts elements into the buffer; the first
    # num_experts elements are not written here.
    pid = tl.program_id(0)
    start_idx = pid * tokens_per_thread
    off_c = (pid + 1) * num_experts
    for i in range(tokens_per_thread):
        # Skip positions past the end of topk_ids (the last slices may be
        # only partially filled).
        if start_idx + i < numel:
            idx = tl.load(topk_ids_ptr + start_idx + i)
            # Read-modify-write of a counter that only this program touches
            # (off_c is unique per pid).
            token_cnt = tl.load(tokens_cnts_ptr + off_c + idx)
            tl.store(tokens_cnts_ptr + off_c + idx, token_cnt + 1)
@triton.jit
 def moe_align_block_size_stage2(
    tokens_cnts_ptr,
    num_experts: tl.constexpr,
 ):
    # Stage 2: in-place running sum along one column of the counts buffer.
    #
    # Program `pid` walks the elements at offsets i * num_experts + pid for
    # i = 1 .. num_experts and overwrites each with the sum of itself and
    # all earlier elements visited in this loop. The element at i = 0 is
    # neither read nor written.
    pid = tl.program_id(0)
    last_cnt = 0
    for i in range(1, num_experts + 1):
        token_cnt = tl.load(tokens_cnts_ptr + i * num_experts + pid)
        last_cnt = last_cnt + token_cnt
        tl.store(tokens_cnts_ptr + i * num_experts + pid, last_cnt)
@triton.jit
 def moe_align_block_size_stage3(
    total_tokens_post_pad_ptr,
    tokens_cnts_ptr,
    cumsum_ptr,
    num_experts: tl.constexpr,
    block_size: tl.constexpr,
 ):
    # Stage 3: build the block-aligned cumulative offsets.
    #
    # Reads the num_experts counters starting at offset
    # num_experts * num_experts in tokens_cnts, rounds each one up to a
    # multiple of block_size, and stores the running total of the rounded
    # values at cumsum_ptr + 1 .. cumsum_ptr + num_experts.
    # cumsum_ptr + 0 is not written by this kernel.
    # The final running total is also stored to total_tokens_post_pad_ptr.
    last_cumsum = 0
    off_cnt = num_experts * num_experts
    for i in range(1, num_experts + 1):
        token_cnt = tl.load(tokens_cnts_ptr + off_cnt + i - 1)
        # cdiv(...) * block_size pads the count up to whole blocks.
        last_cumsum = last_cumsum + tl.cdiv(token_cnt, block_size) * block_size
        tl.store(cumsum_ptr + i, last_cumsum)
    tl.store(total_tokens_post_pad_ptr, last_cumsum)
@triton.jit
 def moe_align_block_size_stage4(
    topk_ids_ptr,
    sorted_token_ids_ptr,
    expert_ids_ptr,
    tokens_cnts_ptr,
    cumsum_ptr,
    num_experts: tl.constexpr,
    block_size: tl.constexpr,
    numel: tl.constexpr,
    tokens_per_thread: tl.constexpr,
 ):
    # Stage 4: fill expert_ids and scatter token indices into sorted order.
    #
    # Each program does two independent jobs, both keyed on `pid`.
    pid = tl.program_id(0)
    # Job 1: treat `pid` as an expert id. For every block_size-wide block in
    # the half-open range [cumsum[pid], cumsum[pid + 1]) record `pid` as the
    # owner of that block in expert_ids.
    start_idx = tl.load(cumsum_ptr + pid)
    end_idx = tl.load(cumsum_ptr + pid + 1)
    for i in range(start_idx, end_idx, block_size):
        tl.store(expert_ids_ptr + i // block_size, pid)
    # Job 2: treat `pid` as a slice index over topk_ids (start_idx is reused
    # with a new meaning from here on).
    start_idx = pid * tokens_per_thread
    off_t = pid * num_experts
    for i in range(start_idx, tl.minimum(start_idx + tokens_per_thread,
                                         numel)):
        expert_id = tl.load(topk_ids_ptr + i)
        # The counter at off_t + expert_id is used as this program's next
        # free slot within the expert's region; presumably it was seeded by
        # the earlier stages with the number of tokens for this expert
        # handled by lower-numbered programs -- confirm against the launcher.
        token_cnt = tl.load(tokens_cnts_ptr + off_t + expert_id)
        rank_post_pad = token_cnt + tl.load(cumsum_ptr + expert_id)
        tl.store(sorted_token_ids_ptr + rank_post_pad, i)
        # Advance the slot so the next token for the same expert lands after
        # this one.
        tl.store(tokens_cnts_ptr + off_t + expert_id, token_cnt + 1)
 # Triton implementation based on:
 # https://github.com/sgl-project/sglang/commit/ba5112ff691d791a9e38c6c71f59324a5fcb49d0
 def moe_align_block_size_triton(
    topk_ids: torch.Tensor,
    num_experts: int,
    block_size: int,
    sorted_token_ids: torch.Tensor,
    expert_ids: torch.Tensor,
    num_tokens_post_pad: torch.Tensor,
 ) -> None:
    """Launch the four Triton stages that align token ids to block_size.

    The three output tensors are written in place; nothing is returned.

    Parameters:
    - topk_ids: tensor whose elements are read as expert ids.
    - num_experts: number of experts; also used as the launch grid size for
      stages 1, 2 and 4.
    - block_size: per-expert token counts are padded up to a multiple of this.
    - sorted_token_ids: output; pre-filled with ``topk_ids.numel()`` and then
      overwritten at the positions stage 4 assigns.
    - expert_ids: output; zeroed and then filled per block by stage 4.
    - num_tokens_post_pad: output; receives the padded total from stage 3.
    """
    numel = topk_ids.numel()
    grid = (num_experts, )
    # Scratch counters, (num_experts + 1) rows of num_experts int32 each,
    # zero-initialised because stage 1 increments them in place.
    tokens_cnts = torch.zeros((num_experts + 1, num_experts),
                              dtype=torch.int32,
                              device=topk_ids.device)
    # Block-aligned cumulative offsets; entry 0 keeps its initial zero.
    cumsum = torch.zeros((num_experts + 1, ),
                         dtype=torch.int32,
                         device=topk_ids.device)
    # Size of the contiguous slice of topk_ids each program handles.
    tokens_per_thread = cdiv(numel, num_experts)
    # `numel` is one past the largest valid flat index into topk_ids, so
    # slots that stage 4 never writes keep an out-of-range value.
    sorted_token_ids.fill_(numel)
    expert_ids.zero_()

    # Stage 1: per-program histogram of expert ids.
    moe_align_block_size_stage1[grid](
        topk_ids,
        tokens_cnts,
        num_experts,
        numel,
        tokens_per_thread,
    )
    # Stage 2: running sum of those histograms, one column per program.
    moe_align_block_size_stage2[grid](
        tokens_cnts,
        num_experts,
    )
    # Stage 3: single program; builds padded cumulative offsets and total.
    moe_align_block_size_stage3[(1, )](
        num_tokens_post_pad,
        tokens_cnts,
        cumsum,
        num_experts,
        block_size,
    )
    # Stage 4: fill expert_ids and scatter token indices.
    moe_align_block_size_stage4[grid](
        topk_ids,
        sorted_token_ids,
        expert_ids,
        tokens_cnts,
        cumsum,
        num_experts,
        block_size,
        numel,
        tokens_per_thread,
    )
 def moe_align_block_size(
--- a/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py
+++ b/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py
@ -76,43 +76,43 @@ def _moe_unpermute_and_reduce(
 def moe_permute(
    hidden_states: torch.Tensor,
-    topk_weights: torch.Tensor,
+    a1q_scale: Optional[torch.Tensor],
    topk_ids: torch.Tensor,
    token_expert_indices: torch.Tensor,
    topk: int,
    n_expert: int,
-    n_local_expert: int,
+    n_local_expert: int = -1,
    expert_map: Optional[torch.Tensor] = None,
    align_block_size: Optional[int] = None,
    fill_invalid_expert: int = -1
-) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+) -> tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor, torch.Tensor,
           torch.Tensor]:
    """
    This function expands and permutes activation to gather non-contiguous
      tokens for each expert.
    Parameters:
    - hidden_states (torch.Tensor): The input tensor to the MoE layer.
-    - topk_weights (torch.Tensor): topk expert route weight for each token.
+    - a1q_scale (Optional[torch.Tensor]): quant scale for hidden_states
    - topk_ids (torch.Tensor): topk expert route id for each token.
    - token_expert_indices (torch.Tensor): indice for expanded hidden.
    - topk (int): The number of top-k experts to select.
    - n_expert (int): The number of expert.
    - n_local_expert (int): The number of expert in current EP rank.
    - expert_map (Optional[torch.Tensor]):  A tensor mapping expert indices
-        from the global expert space to the local expert space of the expert
+        from the global expert space to the local expert space of the expert 
        parallel shard.
    - align_block_size (Optional[int]): align group gemm block size for deepgemm
    - fill_invalid_expert(int): fill expert id in m_indices for invalid expert
      to workaround DeepGemm unsupported -1 in m_indices
    Returns:
    - permuted_hidden_states (torch.Tensor): permuted activation.
    - a1q_scale (Optional[torch.Tensor]): quant scale for hidden_states
    - expert_first_token_offset (torch.Tensor): offset of the first token
       of each expert for standard grouped gemm. if enable 'align_block_size'
       expert_first_token_offset will align up to 'align_block_size'.
-    - src_row_id2dst_row_id_map (torch.Tensor): idx map for moe_unpermute.
+    - inv_permuted_idx (torch.Tensor): idx map for moe_unpermute.
    - permuted_idx (torch.Tensor): idx map from hidden to permuted_hidden.
    - m_indices: m_indices for grouped gemm in deepgemm; `m_indices[i]` records
    the group to which the i-th row of the LHS belongs.
    """
    n_token, n_hidden = hidden_states.size()
    topk = topk_ids.size(1)
    assert (n_hidden * hidden_states.element_size()
            ) % 16 == 0, "permue kernel need hidden dim align to 16B"
    permuted_row_size = n_token * topk
@ -120,12 +120,19 @@ def moe_permute(
        permuted_row_size = (permuted_row_size + n_expert *
                             (align_block_size - 1) + align_block_size -
                             1) // align_block_size * align_block_size
-
+    if n_local_expert == -1:
        n_local_expert = n_expert
    permuted_hidden_states = torch.empty(
        (permuted_row_size, n_hidden),
        dtype=hidden_states.dtype,
        device=hidden_states.device,
    )
    token_expert_indices = torch.arange(0,
                                        n_token * topk,
                                        dtype=torch.int32,
                                        device=hidden_states.device).reshape(
                                            (n_token, topk))
    m_indices = torch.full((permuted_row_size, ),
                           fill_invalid_expert,
                           dtype=torch.int32,
@ -133,57 +140,54 @@ def moe_permute(
    expert_first_token_offset = torch.empty(n_local_expert + 1,
                                            dtype=torch.int64,
                                            device=hidden_states.device)
-    src_row_id2dst_row_id_map = torch.empty((n_token, topk),
+    permuted_idx = torch.full((permuted_row_size, ),
-                                            dtype=torch.int32,
+                              n_token * topk,
-                                            device=hidden_states.device)
+                              dtype=torch.int32,
-    torch.ops._moe_C.moe_permute(hidden_states, topk_weights, topk_ids,
+                              device=hidden_states.device)
-                                 token_expert_indices, expert_map, n_expert,
+    inv_permuted_idx = torch.empty((n_token, topk),
-                                 n_local_expert, topk, align_block_size,
+                                   dtype=torch.int32,
-                                 permuted_hidden_states,
+                                   device=hidden_states.device)
-                                 expert_first_token_offset,
+    topk_ids = topk_ids.to(torch.int32)
-                                 src_row_id2dst_row_id_map, m_indices)
+    torch.ops._moe_C.moe_permute(hidden_states, topk_ids, token_expert_indices,
-    return (permuted_hidden_states, expert_first_token_offset,
+                                 expert_map, n_expert, n_local_expert, topk,
-            src_row_id2dst_row_id_map, m_indices)
+                                 align_block_size, permuted_hidden_states,
                                 expert_first_token_offset, inv_permuted_idx,
                                 permuted_idx, m_indices)
    if a1q_scale is not None:
        a1q_scale = a1q_scale[permuted_idx.clamp(max=n_token * topk - 1) //
                              topk]
    return (permuted_hidden_states, a1q_scale, expert_first_token_offset,
            inv_permuted_idx.flatten(), m_indices)
 def moe_unpermute(
    out: torch.Tensor,
    permuted_hidden_states: torch.Tensor,
    topk_weights: torch.Tensor,
-    topk_ids: torch.Tensor,
+    inv_permuted_idx: torch.Tensor,
-    src_row_id2dst_row_id_map: torch.Tensor,
+    expert_first_token_offset: Optional[torch.Tensor] = None,
-    expert_first_token_offset: torch.Tensor,
+) -> None:
    topk: int,
    n_expert: int,
    n_local_expert: int,
 ) -> torch.Tensor:
    """
    This function expands and permutes activation to gathering uncontinuous
      tokens for each expert.
    Parameters:
    - out (torch.Tensor): output tensor
    - permuted_hidden_states (torch.Tensor): permuted activation.
    - topk_weights (torch.Tensor): topk expert route weight for each token.
-    - topk_ids (torch.Tensor): topk expert route id for each token.
+    - inv_permuted_idx (torch.Tensor): row idx map for moe_unpermute.
-    - expert_first_token_offset (torch.Tensor): offset of the first token
+    - expert_first_token_offset (Optional[torch.Tensor]): offset of the first 
-       of each expert for grouped gemm.
+      token of each expert for grouped gemm.
    - topk (int): The number of top-k experts to select.
    - n_expert (int): The number of expert.
    - n_local_expert (int): The number of expert in current EP rank.
    Returns:
    - hidden_states (torch.Tensor): The reduced and unpermuted activation
      tensor.
    """
-    n_token, n_hidden = topk_weights.size(0), permuted_hidden_states.size(-1)
+    topk = topk_weights.size(1)
    n_hidden = permuted_hidden_states.size(-1)
    assert (n_hidden * permuted_hidden_states.element_size()
            ) % 16 == 0, "unpermue kernel need hidden dim align to 16B"
    hidden_states = torch.empty((n_token, n_hidden),
                                dtype=permuted_hidden_states.dtype,
                                device=permuted_hidden_states.device)
    torch.ops._moe_C.moe_unpermute(permuted_hidden_states, topk_weights,
-                                   topk_ids, src_row_id2dst_row_id_map,
+                                   inv_permuted_idx, expert_first_token_offset,
-                                   expert_first_token_offset, n_expert,
+                                   topk, out)
                                   n_local_expert, topk, hidden_states)
    return hidden_states
 def moe_permute_unpermute_supported():
--- a/vllm/model_executor/layers/mamba/mamba_mixer2.py
+++ b/vllm/model_executor/layers/mamba/mamba_mixer2.py
@ -24,6 +24,7 @@ from vllm.model_executor.layers.mamba.mamba_utils import (
    extra_groups_for_head_shards, get_mamba_state_shape)
 from vllm.model_executor.layers.mamba.ops.causal_conv1d import (
    causal_conv1d_fn, causal_conv1d_update)
 from vllm.model_executor.layers.mamba.ops.layernorm_gated import rms_norm_gated
 from vllm.model_executor.layers.mamba.ops.mamba_ssm import (
    selective_state_update)
 from vllm.model_executor.layers.mamba.ops.ssd_combined import (
@ -133,21 +134,15 @@ class Mixer2RMSNormGated(CustomOp):
            return x * nn.functional.silu(gate.to(
                torch.float32)).to(input_dtype)
-        if self.tp_size > 1 or self.n_groups != 1:
+        if (((self.n_groups % self.tp_size) != 0) or self.n_groups != 1):
            return self.forward_native(x, gate)
-        from vllm import _custom_ops as ops
+        return rms_norm_gated(x,
-
+                              self.weight.data,
-        # cast x and gate to float32 before silu
+                              bias=None,
-        out = torch.empty_like(x)
+                              z=gate,
-        y = x * nn.functional.silu(gate.to(torch.float32))
+                              eps=self.variance_epsilon,
-        ops.rms_norm(
+                              norm_before_gate=False)
            out,
            y.to(x.dtype),
            self.weight.data,
            self.variance_epsilon,
        )
        return out
 def mamba_v2_sharded_weight_loader(
--- a/vllm/model_executor/layers/mamba/ops/layernorm_gated.py
+++ b/vllm/model_executor/layers/mamba/ops/layernorm_gated.py
@ -0,0 +1,168 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # Copyright (c) 2024, Tri Dao.
 # Adapted from https://github.com/state-spaces/mamba/blob/60dadf2e0ee730ac337035d5533de10bc26e4847/mamba_ssm/ops/triton/layernorm_gated.py
 import torch
 from vllm.triton_utils import tl, triton
# Fused single-pass forward kernel for LayerNorm / RMSNorm with an optional
# multiplicative gate branch Z.
#
# One program instance handles one (row, group) pair: program_id(0) selects
# the row and program_id(1) selects the group. Inside the kernel, N is the
# number of columns of ONE group (the launcher in this file passes
# `group_size` for N), and a whole group is loaded as a single BLOCK_N-wide
# vector, so BLOCK_N must be >= N.
#
# HAS_BIAS / HAS_Z are derived by the heuristics below from whether the B / Z
# pointers are None.
@triton.heuristics({"HAS_BIAS": lambda args: args["B"] is not None})
@triton.heuristics({"HAS_Z": lambda args: args["Z"] is not None})
@triton.jit
 def _layer_norm_fwd_1pass_kernel(
    X,  # pointer to the input
    Y,  # pointer to the output
    W,  # pointer to the weights
    B,  # pointer to the biases
    Z,  # pointer to the other branch
    Mean,  # pointer to the mean
    Rstd,  # pointer to the 1/std
    stride_x_row: tl.int64,
    stride_y_row: tl.int64,
    stride_z_row: tl.int64,
    M: tl.int64,  # number of rows in X
    N: tl.int64,  # number of columns in X
    eps,  # epsilon to avoid division by zero
    BLOCK_N: tl.constexpr,
    HAS_BIAS: tl.constexpr,
    HAS_Z: tl.constexpr,
    NORM_BEFORE_GATE: tl.constexpr,
    IS_RMS_NORM: tl.constexpr,
 ):
    # Map the program id to the row of X and Y it should compute.
    row = tl.program_id(0)
    group = tl.program_id(1)
    # Advance every pointer to the start of this (row, group) slice.
    X += row * stride_x_row + group * N
    Y += row * stride_y_row + group * N
    if HAS_Z:
        Z += row * stride_z_row + group * N
    # Statistics buffers are indexed as [group * M + row]; Mean is only
    # touched when a mean is actually computed (i.e. not for RMS norm).
    if not IS_RMS_NORM:
        Mean += group * M
    Rstd += group * M
    # Weights and biases are offset per group as well.
    W += group * N
    if HAS_BIAS:
        B += group * N
    # Compute mean and variance
    cols = tl.arange(0, BLOCK_N)
    # Out-of-range lanes are loaded as 0 so they do not contribute to the sums.
    x = tl.load(X + cols, mask=cols < N, other=0.).to(tl.float32)
    if HAS_Z and not NORM_BEFORE_GATE:
        # Gate first: multiply x by z * sigmoid(z) before the statistics.
        z = tl.load(Z + cols, mask=cols < N).to(tl.float32)
        x *= z * tl.sigmoid(z)
    if not IS_RMS_NORM:
        mean = tl.sum(x, axis=0) / N
        tl.store(Mean + row, mean)
        xbar = tl.where(cols < N, x - mean, 0.)
        var = tl.sum(xbar * xbar, axis=0) / N
    else:
        # RMS norm: mean of squares, without centering.
        xbar = tl.where(cols < N, x, 0.)
        var = tl.sum(xbar * xbar, axis=0) / N
    rstd = 1 / tl.sqrt(var + eps)
    tl.store(Rstd + row, rstd)
    # Normalize and apply linear transformation
    mask = cols < N
    w = tl.load(W + cols, mask=mask).to(tl.float32)
    if HAS_BIAS:
        b = tl.load(B + cols, mask=mask).to(tl.float32)
    x_hat = (x - mean) * rstd if not IS_RMS_NORM else x * rstd
    y = x_hat * w + b if HAS_BIAS else x_hat * w
    if HAS_Z and NORM_BEFORE_GATE:
        # Gate last: multiply the normalized output by z * sigmoid(z).
        z = tl.load(Z + cols, mask=mask).to(tl.float32)
        y *= z * tl.sigmoid(z)
    # Write output
    tl.store(Y + cols, y, mask=mask)
 def _layer_norm_fwd(x,
                    weight,
                    bias,
                    eps,
                    z=None,
                    out=None,
                    group_size=None,
                    norm_before_gate=True,
                    is_rms_norm=False):
    """Check the inputs and launch the fused norm kernel on a 2D tensor.

    The columns of ``x`` are split into ``N // group_size`` groups (a single
    group when ``group_size`` is None) and one kernel program is launched per
    (row, group) pair.

    Returns ``(out, mean, rstd)``. ``mean`` is None when ``is_rms_norm`` is
    set; otherwise both statistics are float32 buffers with
    ``ngroups * M`` entries.

    Raises RuntimeError when one group does not fit into a single block.
    """
    n_rows, n_cols = x.shape
    cols_per_group = n_cols if group_size is None else group_size
    assert n_cols % cols_per_group == 0
    n_groups = n_cols // cols_per_group
    # Every tensor handed to the kernel must have unit stride in its last dim.
    assert x.stride(-1) == 1
    if z is not None:
        assert z.stride(-1) == 1
        assert z.shape == (n_rows, n_cols)
    assert weight.shape == (n_cols, )
    assert weight.stride(-1) == 1
    if bias is not None:
        assert bias.stride(-1) == 1
        assert bias.shape == (n_cols, )
    # Reuse the caller's output buffer when given, otherwise allocate one.
    if out is None:
        out = torch.empty_like(x)
    else:
        assert out.shape == x.shape
    assert out.stride(-1) == 1
    stats_shape = (n_groups * n_rows, )
    if is_rms_norm:
        mean = None
    else:
        mean = torch.empty(stats_shape, dtype=torch.float32, device=x.device)
    rstd = torch.empty(stats_shape, dtype=torch.float32, device=x.device)
    # A whole group is processed as one block, capped at 64KB worth of
    # input elements.
    max_block = 65536 // x.element_size()
    block_n = min(max_block, triton.next_power_of_2(cols_per_group))
    if cols_per_group > block_n:
        raise RuntimeError(
            "This layer norm doesn't support feature dim >= 64KB.")
    # Warp-count heuristic: block_n // 256, clamped to the range [1, 8].
    warps = min(max(block_n // 256, 1), 8)
    z_row_stride = 0 if z is None else z.stride(0)
    launch_grid = (n_rows, n_groups)
    with torch.cuda.device(x.device.index):
        _layer_norm_fwd_1pass_kernel[launch_grid](
            x,
            out,
            weight,
            bias,
            z,
            mean,
            rstd,
            x.stride(0),
            out.stride(0),
            z_row_stride,
            n_rows,
            cols_per_group,
            eps,
            BLOCK_N=block_n,
            NORM_BEFORE_GATE=norm_before_gate,
            IS_RMS_NORM=is_rms_norm,
            num_warps=warps)
    return out, mean, rstd
 def rms_norm_gated(x,
                   weight,
                   bias,
                   z=None,
                   eps=1e-6,
                   group_size=None,
                   norm_before_gate=True):
    """Apply (optionally gated) RMS norm over the last dim of ``x``.

    ``x`` (and ``z``, which must have the same shape as ``x``) are flattened
    to 2D, made contiguous in the last dim if needed, passed to
    ``_layer_norm_fwd`` with ``is_rms_norm=True``, and the result is reshaped
    back to the original shape of ``x``.
    """
    original_shape = x.shape

    def _as_rows(t):
        # Collapse all leading dims; copy only if the last dim is strided.
        rows = t.reshape(-1, t.shape[-1])
        return rows if rows.stride(-1) == 1 else rows.contiguous()

    x = _as_rows(x)
    if z is not None:
        assert z.shape == original_shape
        z = _as_rows(z)
    weight = weight.contiguous()
    if bias is not None:
        bias = bias.contiguous()
    normed, _mean, _rstd = _layer_norm_fwd(x,
                                           weight,
                                           bias,
                                           eps,
                                           z=z,
                                           group_size=group_size,
                                           norm_before_gate=norm_before_gate,
                                           is_rms_norm=True)
    return normed.reshape(original_shape)
--- a/vllm/model_executor/layers/pooler.py
+++ b/vllm/model_executor/layers/pooler.py
@ -16,8 +16,9 @@ from vllm.config import ModelConfig, PoolerConfig
 from vllm.model_executor.pooling_metadata import (  # noqa: E501
    PoolingMetadata as V0PoolingMetadata)
 from vllm.model_executor.pooling_metadata import PoolingTensors
-from vllm.pooling_params import PoolingParams, PoolingTask
+from vllm.pooling_params import PoolingParams
 from vllm.sequence import PoolerOutput, PoolingSequenceGroupOutput
 from vllm.tasks import PoolingTask
 from vllm.utils import resolve_obj_by_qualname
 from vllm.v1.pool.metadata import PoolingMetadata as V1PoolingMetadata
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@ -33,7 +33,7 @@ from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
    find_matched_target, is_activation_quantization_format,
    should_ignore_layer)
 from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
-from vllm.model_executor.layers.quantization.utils.nvfp4_emulation_utils import (  # noqa: E501
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
    cutlass_fp4_supported)
 from vllm.platforms import current_platform
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@ -27,8 +27,8 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import (
    prepare_moe_fp4_layer_for_marlin)
 from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
    prepare_moe_fp8_layer_for_marlin)
-from vllm.model_executor.layers.quantization.utils.nvfp4_emulation_utils import (  # noqa: E501
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    cutlass_fp4_supported)
+    cutlass_fp4_supported, swizzle_blockscale)
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
    all_close_1d, normalize_e4m3fn_to_e4m3fnuz, per_tensor_dequantize)
 from vllm.model_executor.utils import set_weight_attrs
@ -193,29 +193,6 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
            {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value})
        set_weight_attrs(w2_input_scale, extra_weight_attrs)
    def swizzle_blockscale(self, scale: torch.tensor):
        assert (scale.dtype == torch.float8_e4m3fn)
        # Pad and blockwise interleave weight_scale
        scale_ndim = scale.ndim
        if scale.ndim == 2:
            scale = scale.unsqueeze(0)
        assert scale.ndim == 3
        B, M, K = scale.shape
        round_up_multiple = lambda x, m: (x + m - 1) // m * m
        M_padded = round_up_multiple(M, 128)
        K_padded = round_up_multiple(K, 4)
        padded_scale = torch.zeros((B, M_padded, K_padded), dtype=scale.dtype)
        padded_scale[:B, :M, :K] = scale
        batches, rows, cols = padded_scale.shape
        assert rows % 128 == 0
        assert cols % 4 == 0
        padded_scale = padded_scale.reshape(batches, rows // 128, 4, 32,
                                            cols // 4, 4)
        swizzled_scale = padded_scale.permute((0, 1, 4, 3, 2, 5))
        swizzled_scale = swizzled_scale.contiguous().cuda()
        return (swizzled_scale.reshape(M, K)
                if scale_ndim == 2 else swizzled_scale.reshape(B, M, K))
    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        # From packed to weight
@ -243,13 +220,13 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
            return
        # swizzle weight scales
-        layer.w13_blockscale_swizzled = torch.nn.Parameter(
+        layer.w13_blockscale_swizzled = torch.nn.Parameter(swizzle_blockscale(
-            self.swizzle_blockscale(layer.w13_weight_scale),
+            layer.w13_weight_scale),
-            requires_grad=False)
+                                                           requires_grad=False)
-        layer.w2_blockscale_swizzled = torch.nn.Parameter(
+        layer.w2_blockscale_swizzled = torch.nn.Parameter(swizzle_blockscale(
-            self.swizzle_blockscale(layer.w2_weight_scale),
+            layer.w2_weight_scale),
-            requires_grad=False)
+                                                          requires_grad=False)
        # w13
        w13_input_global_scale = layer.w13_input_global_scale.max(
--- a/vllm/model_executor/layers/quantization/modelopt.py
+++ b/vllm/model_executor/layers/quantization/modelopt.py
@ -9,8 +9,7 @@ from torch.nn.parameter import Parameter
 import vllm.envs as envs
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
-from vllm._custom_ops import (cutlass_scaled_fp4_mm,
+from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
                              cutlass_scaled_mm_supports_fp4, scaled_fp4_quant)
 from vllm.distributed import get_ep_group
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe.config import FusedMoEParallelConfig
@ -28,7 +27,7 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import (
    apply_fp4_marlin_linear, is_fp4_marlin_supported,
    prepare_fp4_layer_for_marlin, prepare_moe_fp4_layer_for_marlin)
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    GroupShape, is_layer_skipped)
+    GroupShape, cutlass_fp4_supported, is_layer_skipped, swizzle_blockscale)
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
    Fp8LinearOp, requantize_with_max_scale)
 from vllm.model_executor.parameter import (ModelWeightParameter,
@ -667,14 +666,6 @@ class ModelOptNvFp4Config(QuantizationConfig):
        return None
 def cutlass_fp4_supported() -> bool:
    """Report whether the CUTLASS FP4 scaled-mm path can be used.

    Returns False on non-CUDA platforms. Otherwise the device capability is
    converted to an int (-1 when the platform reports no capability) and
    passed to ``cutlass_scaled_mm_supports_fp4``.
    """
    if current_platform.is_cuda():
        device_cap = current_platform.get_device_capability()
        if device_cap is None:
            cap_code = -1
        else:
            cap_code = device_cap.to_int()
        return cutlass_scaled_mm_supports_fp4(cap_code)
    return False
 class ModelOptFp8KVCacheMethod(BaseKVCacheMethod):
    """
    Supports loading kv-cache scaling factors from FP8 checkpoints.
@ -772,29 +763,6 @@ class ModelOptNvFp4LinearMethod(LinearMethodBase):
        layer.register_parameter("weight_scale", weight_scale)
    def swizzle_blockscale(self, scale: torch.tensor):
        assert (scale.dtype == torch.float8_e4m3fn)
        # Pad and blockwise interleave weight_scale
        scale_ndim = scale.ndim
        if scale.ndim == 2:
            scale = scale.unsqueeze(0)
        assert scale.ndim == 3
        B, M, K = scale.shape
        round_up_multiple = lambda x, m: (x + m - 1) // m * m
        M_padded = round_up_multiple(M, 128)
        K_padded = round_up_multiple(K, 4)
        padded_scale = torch.zeros((B, M_padded, K_padded), dtype=scale.dtype)
        padded_scale[:B, :M, :K] = scale
        batches, rows, cols = padded_scale.shape
        assert rows % 128 == 0
        assert cols % 4 == 0
        padded_scale = padded_scale.reshape(batches, rows // 128, 4, 32,
                                            cols // 4, 4)
        swizzled_scale = padded_scale.permute((0, 1, 4, 3, 2, 5))
        swizzled_scale = swizzled_scale.contiguous().cuda()
        return (swizzled_scale.reshape(M, K)
                if scale_ndim == 2 else swizzled_scale.reshape(B, M, K))
    def process_weights_after_loading(self, layer: Module) -> None:
        # global scales:
@ -814,7 +782,7 @@ class ModelOptNvFp4LinearMethod(LinearMethodBase):
            "Expected weight_scale.dim(1) to be divisible by 16")
        assert (layer.weight_scale.dtype == torch.float8_e4m3fn), (
            "Weight Block scale must be represented as FP8-E4M3")
-        swizzled_weight_scale = self.swizzle_blockscale(layer.weight_scale)
+        swizzled_weight_scale = swizzle_blockscale(layer.weight_scale)
        layer.weight_scale_swizzled = Parameter(swizzled_weight_scale,
                                                requires_grad=False)
@ -1060,29 +1028,6 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
                                                 weight_loader=weight_loader)
        layer.register_parameter("w2_input_scale", w2_input_scale)
    def swizzle_blockscale(self, scale: torch.tensor):
        assert (scale.dtype == torch.float8_e4m3fn)
        # Pad and blockwise interleave weight_scale
        scale_ndim = scale.ndim
        if scale.ndim == 2:
            scale = scale.unsqueeze(0)
        assert scale.ndim == 3
        B, M, K = scale.shape
        round_up_multiple = lambda x, m: (x + m - 1) // m * m
        M_padded = round_up_multiple(M, 128)
        K_padded = round_up_multiple(K, 4)
        padded_scale = torch.zeros((B, M_padded, K_padded), dtype=scale.dtype)
        padded_scale[:B, :M, :K] = scale
        batches, rows, cols = padded_scale.shape
        assert rows % 128 == 0
        assert cols % 4 == 0
        padded_scale = padded_scale.reshape(batches, rows // 128, 4, 32,
                                            cols // 4, 4)
        swizzled_scale = padded_scale.permute((0, 1, 4, 3, 2, 5))
        swizzled_scale = swizzled_scale.contiguous().cuda()
        return (swizzled_scale.reshape(M, K)
                if scale_ndim == 2 else swizzled_scale.reshape(B, M, K))
    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        # GEMM 1
        # The FlashInfer Cutlass fused MoE kernel expects the combined weights
@ -1128,8 +1073,7 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
            "Expected weight_scale.dim(1) to be divisible by 16")
        assert (layer.w13_weight_scale.dtype == torch.float8_e4m3fn), (
            "Weight Blockscale must be represented as FP8-E4M3")
-        w13_blockscale_swizzled = self.swizzle_blockscale(
+        w13_blockscale_swizzled = swizzle_blockscale(layer.w13_weight_scale)
            layer.w13_weight_scale)
        layer.w13_blockscale_swizzled = Parameter(w13_blockscale_swizzled,
                                                  requires_grad=False)
@ -1151,7 +1095,7 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
            "Expected weight_scale.dim(1) to be divisible by 16")
        assert (layer.w2_weight_scale.dtype == torch.float8_e4m3fn), (
            "Weight Blockscale must be represented as FP8-E4M3")
-        w2_blockscale_swizzled = self.swizzle_blockscale(layer.w2_weight_scale)
+        w2_blockscale_swizzled = swizzle_blockscale(layer.w2_weight_scale)
        layer.w2_blockscale_swizzled = Parameter(w2_blockscale_swizzled,
                                                 requires_grad=False)
--- a/vllm/model_executor/layers/quantization/rtn.py
+++ b/vllm/model_executor/layers/quantization/rtn.py
@ -3,18 +3,19 @@
 # Copyright © 2025, Oracle and/or its affiliates.
 import os
-from typing import Any, Optional
+from typing import Any, Callable, Optional
 import torch
 import torch.nn.functional as F
 from torch.nn.parameter import Parameter
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe import FusedMoE, FusedMoEMethodBase
 from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
                                               set_weight_attrs)
 from vllm.model_executor.layers.quantization import QuantizationMethods
 from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig)
+    QuantizationConfig, QuantizeMethodBase)
 logger = init_logger(__name__)
 """By default, use 8 bit as target precision, but it can be 
@ -71,9 +72,11 @@ class RTNConfig(QuantizationConfig):
        return cls(weight_bits, group_size)
    def get_quant_method(self, layer: torch.nn.Module,
-                         prefix: str) -> Optional["RTNLinearMethod"]:
+                         prefix: str) -> Optional["QuantizeMethodBase"]:
        if isinstance(layer, LinearBase):
            return RTNLinearMethod(self)
        elif isinstance(layer, FusedMoE):
            return RTNMoEMethod(self)
        return None
@ -94,11 +97,18 @@ class RTNTensor:
            self.data.narrow(dim, start // factor, length // factor),
            self.scale.narrow(dim, start, length), self.quant_config)
    def __getitem__(self, key):
        """Index the quantized data and its scales with the same ``key``.

        Returns a new RTNTensor over the two selected slices that reuses this
        tensor's ``quant_config``.
        """
        selected_data = self.data[key]
        selected_scale = self.scale[key]
        return RTNTensor(selected_data, selected_scale, self.quant_config)
    @property
    def shape(self):
        shape = self.data.shape
        factor = 1 if self.quant_config.weight_bits == 8 else 2
-        return torch.Size((shape[0] * factor, shape[1]))
+        batch_present = len(shape) == 3
        if batch_present:
            return torch.Size((shape[0], shape[1] * factor, shape[2]))
        else:
            return torch.Size((shape[0] * factor, shape[1]))
    def copy_(self, loaded_weight: torch.Tensor) -> None:
        qweight, weight_scale = rtn_quantize(loaded_weight.cuda(),
@ -165,7 +175,7 @@ class RTNLinearMethod(LinearMethodBase):
        weight = RTNParameter(data=torch.empty(output_size_per_partition //
                                               factor,
                                               input_size_per_partition,
-                                               dtype=torch.int8),
+                                               dtype=torch.uint8),
                              scale=scale,
                              quant_config=self.quant_config)
@ -180,18 +190,7 @@ class RTNLinearMethod(LinearMethodBase):
        layer.output_size_per_partition = output_size_per_partition
    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        """torch.compile does not know how to deal with a Parameter subclass
+        fix_weights(layer, "weight")
        (aka RTNParameter). As we don't really need RTNParameters for the
        forward pass, we replace them with equivalent instances of Parameters.
        """
        old_weight = layer.weight
        assert isinstance(old_weight, RTNParameter)
        data = old_weight.data.data
        delattr(layer, "weight")
        new_weight = Parameter(data=data, requires_grad=False)
        layer.register_parameter("weight", new_weight)
    def apply(self,
              layer: torch.nn.Module,
@ -209,6 +208,128 @@ class RTNLinearMethod(LinearMethodBase):
        return out
 class RTNMoEMethod(FusedMoEMethodBase):
    """Fused-MoE method that stores expert weights as RTNParameters."""

    def __init__(self, quant_config: RTNConfig):
        self.quant_config = quant_config

    def create_weights(self, layer: torch.nn.Module, num_experts: int,
                       hidden_size: int, intermediate_size_per_partition: int,
                       params_dtype: torch.dtype, **extra_weight_attrs):
        """Register the scale and quantized weight parameters on ``layer``.

        Four parameters are registered: ``w13_scale`` / ``w13_weight`` for the
        fused gate_up projection and ``w2_scale`` / ``w2_weight`` for the down
        projection. ``extra_weight_attrs`` is attached to both weights.
        """
        cfg = self.quant_config
        # The stored row count is halved unless the weights are 8-bit.
        pack = 1 if cfg.weight_bits == 8 else 2

        def groups_along(width: int) -> int:
            # group_size == -1 means a single group per row.
            return width // cfg.group_size if cfg.group_size != -1 else 1

        # Fused gate_up_proj (column parallel)
        w13_scale = Parameter(
            torch.empty(num_experts,
                        2 * intermediate_size_per_partition,
                        groups_along(hidden_size),
                        dtype=params_dtype),
            requires_grad=False,
        )
        layer.register_parameter("w13_scale", w13_scale)
        w13_storage = torch.empty(num_experts,
                                  2 * intermediate_size_per_partition // pack,
                                  hidden_size,
                                  dtype=torch.uint8)
        w13_weight = RTNParameter(data=w13_storage,
                                  scale=w13_scale,
                                  quant_config=cfg)
        layer.register_parameter("w13_weight", w13_weight)
        set_weight_attrs(w13_weight, extra_weight_attrs)

        # down_proj (row parallel)
        w2_scale = Parameter(
            torch.zeros(num_experts,
                        hidden_size,
                        groups_along(intermediate_size_per_partition),
                        dtype=params_dtype),
            requires_grad=False,
        )
        layer.register_parameter("w2_scale", w2_scale)
        w2_storage = torch.empty(num_experts,
                                 hidden_size // pack,
                                 intermediate_size_per_partition,
                                 dtype=torch.uint8)
        w2_weight = RTNParameter(data=w2_storage,
                                 scale=w2_scale,
                                 quant_config=cfg)
        layer.register_parameter("w2_weight", w2_weight)
        set_weight_attrs(w2_weight, extra_weight_attrs)

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        """Run ``fix_weights`` on both expert weight parameters."""
        is_four_bit = self.quant_config.weight_bits == 4
        for param_name in ("w13_weight", "w2_weight"):
            fix_weights(layer, param_name, is_four_bit)

    def apply(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
        router_logits: torch.Tensor,
        top_k: int,
        renormalize: bool,
        use_grouped_topk: bool = False,
        topk_group: Optional[int] = None,
        num_expert_group: Optional[int] = None,
        global_num_experts: int = -1,
        expert_map: Optional[torch.Tensor] = None,
        custom_routing_function: Optional[Callable] = None,
        scoring_func: str = "softmax",
        e_score_correction_bias: Optional[torch.Tensor] = None,
        apply_router_weight_on_input: bool = False,
        activation: str = "silu",
        enable_eplb: bool = False,
        expert_load_view: Optional[torch.Tensor] = None,
        logical_to_physical_map: Optional[torch.Tensor] = None,
        logical_replica_count: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Select experts for ``x`` and run ``fused_experts`` on them.

        Routing is done with ``FusedMoE.select_experts``; the expert GEMMs use
        the layer's ``w13``/``w2`` weights and scales, in int4 or int8 mode
        depending on ``weight_bits``.

        Raises NotImplementedError when ``enable_eplb`` is set.
        """
        if enable_eplb:
            raise NotImplementedError(
                "EPLB not supported for `RTNMoEMethod` yet.")

        from vllm.model_executor.layers.fused_moe import fused_experts

        topk_weights, topk_ids = FusedMoE.select_experts(
            hidden_states=x,
            router_logits=router_logits,
            use_grouped_topk=use_grouped_topk,
            top_k=top_k,
            renormalize=renormalize,
            topk_group=topk_group,
            num_expert_group=num_expert_group,
            custom_routing_function=custom_routing_function,
            scoring_func=scoring_func,
            e_score_correction_bias=e_score_correction_bias)

        bits = self.quant_config.weight_bits
        return fused_experts(
            x,
            layer.w13_weight,
            layer.w2_weight,
            topk_weights=topk_weights,
            topk_ids=topk_ids,
            inplace=True,
            activation=activation,
            use_int4_w4a16=bits == 4,
            use_int8_w8a16=bits == 8,
            global_num_experts=global_num_experts,
            w1_scale=layer.w13_scale,
            w2_scale=layer.w2_scale,
            apply_router_weight_on_input=apply_router_weight_on_input,
            expert_map=expert_map,
            block_shape=[0, self.quant_config.group_size])
 def rtn_quantize(tensor: torch.Tensor, num_bits: int,
                 group_size: int) -> tuple[torch.Tensor, torch.Tensor]:
    """Quantize a tensor using per-group static scaling factor.
@ -221,34 +342,44 @@ def rtn_quantize(tensor: torch.Tensor, num_bits: int,
                    If equal to -1, each row in the input tensor is treated
                    as one group.
    """
    batch_present = len(tensor.shape) == 3
    if not batch_present:
        tensor = tensor.unsqueeze(0)
    q_range = 2**num_bits
-    num_groups = (tensor.shape[0] * tensor.shape[1] //
+    num_groups = (tensor.shape[1] * tensor.shape[2] //
-                  group_size if group_size != -1 else tensor.shape[0])
+                  group_size if group_size != -1 else tensor.shape[1])
    """Calculate a scaling factor per input group.
    """
-    input_flat = tensor.reshape(num_groups, -1)
+    input_flat = tensor.reshape(tensor.shape[0], num_groups, -1)
-    input_min = torch.min(input_flat, dim=1, keepdim=True)[0]
+    input_min = torch.min(input_flat, dim=2, keepdim=True)[0]
-    input_max = torch.max(input_flat, dim=1, keepdim=True)[0]
+    input_max = torch.max(input_flat, dim=2, keepdim=True)[0]
    input_max_abs = torch.max(input_min.abs(), input_max.abs())
    scale = (input_max_abs * 2.0 / (q_range - 1))
-    """Scale each input group, truncate and round to the nearest integer.
+    """Scale each input group, round to the nearest integer, shift 
    the range and truncate.
    """
    scaled_input = input_flat / scale
    scaled_input = scaled_input.clamp(-q_range // 2, q_range // 2 - 1)
    scaled_input = scaled_input.round()
    scaled_input += q_range // 2
    scaled_input = scaled_input.clamp(0, q_range - 1)
-    scale = scale.reshape(tensor.shape[0], -1).contiguous()
+    scale = scale.reshape(tensor.shape[0], tensor.shape[1], -1).contiguous()
-    inputs_q = scaled_input.reshape(tensor.shape).to(torch.int8)
+    inputs_q = scaled_input.reshape(tensor.shape).to(torch.uint8)
    inputs_q = inputs_q.contiguous()
    if num_bits == 4:
        """Pack two 4-bit values into each byte.
        """
-        inputs_q = (inputs_q[:, 1::2] << 4) | (inputs_q[:, ::2] & 0xf)
+        inputs_q = (inputs_q[:, :, 1::2] << 4) | (inputs_q[:, :, ::2] & 0xf)
-        inputs_q = inputs_q.reshape(tensor.shape[0] // 2, tensor.shape[1])
+        inputs_q = inputs_q.reshape(tensor.shape[0], tensor.shape[1] // 2,
                                    tensor.shape[2])
        inputs_q = inputs_q.contiguous()
    if not batch_present:
        inputs_q = inputs_q.squeeze(0)
        scale = scale.squeeze(0)
    return inputs_q, scale
@ -259,31 +390,60 @@ def rtn_dequantize(tensor: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
        tensor: The input tensor.
        scale: The tensor with per-group scale factors.
    """
    batch_present = len(tensor.shape) == 3
    if not batch_present:
        tensor = tensor.unsqueeze(0)
        scale = scale.unsqueeze(0)
-    num_groups = scale.size(0) * scale.size(1)
+    num_groups = scale.size(1) * scale.size(2)
-    input_dim, output_dim = tensor.shape
+    batch, input_dim, output_dim = tensor.shape
-    num_bits = 8 if input_dim == scale.size(0) else 4
+    num_bits = 8 if input_dim == scale.size(1) else 4
    q_range = 2**num_bits
    if num_bits == 4:
        input_dim *= 2
-    data = torch.empty((input_dim, output_dim),
+    data = torch.empty((batch, input_dim, output_dim),
                       dtype=scale.dtype,
                       device=tensor.device)
    if num_bits == 8:
        data.copy_(tensor)
        data -= q_range // 2
    else:
        """Unpack two 4-bit values from each byte.
        """
-        tensor = tensor.reshape(input_dim, output_dim // 2)
+        tensor = tensor.reshape(batch, input_dim, output_dim // 2)
        for i in range(2):
-            data[:, i::2] = (tensor << 4 * (1 - i)) >> 4
+            data[:, :, i::2] = ((tensor << 4 *
                                 (1 - i)) >> 4).to(torch.int8) - q_range // 2
    """Scale each input group with its scaling factor.
    """
-    scale = scale.reshape(num_groups, -1)
+    scale = scale.reshape(batch, num_groups, -1)
-    data = data.reshape(num_groups, -1)
+    data = data.reshape(batch, num_groups, -1)
    data = torch.mul(data, scale)
-    input_deq = data.reshape((input_dim, output_dim)).contiguous()
+    input_deq = data.reshape((batch, input_dim, output_dim)).contiguous()
    if not batch_present:
        input_deq = input_deq.squeeze(0)
    return input_deq
 def fix_weights(layer: torch.nn.Module,
                 param_name: str,
                 reshape: bool = False):
    """Swap an RTNParameter on `layer` for a plain, frozen Parameter.
    torch.compile does not know how to deal with a Parameter subclass
    (aka RTNParameter). The forward pass does not need RTNParameters, so
    the attribute `param_name` is re-registered as an ordinary Parameter
    wrapping the same underlying data.
    Args:
        layer: Module that owns the parameter.
        param_name: Attribute name of the RTNParameter to replace.
        reshape: When True, the data is reshaped to
            `(shape[0], shape[1] * 2, -1)`, where `shape` is the shape
            reported by the RTNParameter itself.
            NOTE(review): presumably this undoes 4-bit packing - confirm.
    """
    rtn_param = getattr(layer, param_name)
    assert isinstance(rtn_param, RTNParameter)
    # `.data.data` reaches the tensor stored beneath the RTNParameter view.
    raw = rtn_param.data.data
    # The old attribute has to go before the name can be registered again.
    delattr(layer, param_name)
    if reshape:
        target_shape = (rtn_param.shape[0], rtn_param.shape[1] * 2, -1)
        raw = raw.reshape(*target_shape)
    layer.register_parameter(param_name,
                             Parameter(data=raw, requires_grad=False))
--- a/vllm/model_executor/layers/quantization/utils/int8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/int8_utils.py
@ -238,13 +238,20 @@ def per_token_group_quant_int8(
    int8_min = iinfo.min
    x_q = torch.empty_like(x, device=x.device, dtype=dtype)
    M = x.numel() // group_size
    N = group_size
    x_s = torch.empty(
        x.shape[:-1] + (x.shape[-1] // group_size, ),
        device=x.device,
        dtype=torch.float32,
    )
    # prefer CUDA kernel if available
    if current_platform.is_cuda():
        torch.ops._C.per_token_group_quant_int8(x, x_q, x_s, group_size, eps,
                                                float(int8_min),
                                                float(int8_max))
        return x_q, x_s
    M = x.numel() // group_size
    N = group_size
    BLOCK = triton.next_power_of_2(N)
    # heuristics for number of warps
--- a/vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py
@ -2,13 +2,12 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import torch
 from vllm._custom_ops import cutlass_scaled_mm_supports_fp4
 from vllm.platforms import current_platform
 from vllm.scalar_type import scalar_types
 __all__ = [
-    "break_fp4_bytes", "dequantize_to_dtype", "ref_nvfp4_quant",
+    "break_fp4_bytes",
-    "cutlass_fp4_supported"
+    "dequantize_to_dtype",
    "ref_nvfp4_quant",
 ]
 FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max()
@ -17,14 +16,6 @@ kE2M1ToFloat = torch.tensor([0., 0.5, 1., 1.5, 2., 3., 4., 6.],
                            dtype=torch.float32)
 def cutlass_fp4_supported() -> bool:
    """Return whether `cutlass_scaled_mm_supports_fp4` accepts this device.
    Always False when the current platform is not CUDA. Otherwise the device
    capability is converted to an int (-1 when the platform reports no
    capability) and handed to `cutlass_scaled_mm_supports_fp4`.
    """
    if current_platform.is_cuda():
        device_capability = current_platform.get_device_capability()
        if device_capability is None:
            capability_int = -1
        else:
            capability_int = device_capability.to_int()
        return cutlass_scaled_mm_supports_fp4(capability_int)
    return False
 def break_fp4_bytes(a, dtype):
    assert a.dtype == torch.uint8
    m, n = a.shape
--- a/vllm/model_executor/layers/quantization/utils/quant_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/quant_utils.py
@ -8,8 +8,10 @@ from typing import ClassVar, NamedTuple, Optional
 import numpy
 import torch
 from vllm._custom_ops import cutlass_scaled_mm_supports_fp4
 from vllm.model_executor.layers.quantization.qqq import (
    MARLIN_QQQ_SUPPORTED_NUM_BITS)
 from vllm.platforms import current_platform
 from vllm.scalar_type import ScalarType, scalar_types
@ -592,3 +594,56 @@ def awq_pack(
    q_w = q_w.reshape((-1, size_n)).contiguous()
    return pack_cols(q_w, num_bits, size_k, size_n)
 def swizzle_blockscale(scale: torch.Tensor) -> torch.Tensor:
    """
    Pad and block-interleave the FP4 block-scales so that they match the data
    layout expected by the CUTLASS / FlashInfer kernels.
    Parameters
    ----------
    scale: torch.Tensor
    Returns
    -------
    torch.Tensor
        The swizzled tensor with the same logical shape as *scale*.
    """
    assert scale.dtype == torch.float8_e4m3fn, (
        "swizzle_blockscale expects the input tensor to be in "
        "torch.float8_e4m3fn format.")
    scale_ndim = scale.ndim
    if scale_ndim == 2:
        scale = scale.unsqueeze(0)  # (1, M, K)
    assert scale.ndim == 3, "Expected a 2-D or 3-D tensor for block scales."
    B, M, K = scale.shape
    def _round_up(x: int, m: int) -> int:
        return (x + m - 1) // m * m
    M_padded = _round_up(M, 128)
    K_padded = _round_up(K, 4)
    padded = torch.zeros((B, M_padded, K_padded),
                         dtype=scale.dtype,
                         device=scale.device)
    padded[:B, :M, :K] = scale
    # Reshape / permute to the layout required by the kernel.
    padded = padded.reshape(B, M_padded // 128, 4, 32, K_padded // 4, 4)
    swizzled = padded.permute(0, 1, 4, 3, 2, 5).contiguous().cuda()
    if scale_ndim == 2:
        return swizzled.reshape(M, K)
    return swizzled.reshape(B, M, K)
 def cutlass_fp4_supported() -> bool:
    """Return whether `cutlass_scaled_mm_supports_fp4` accepts this device.
    Non-CUDA platforms always yield False. On CUDA, the device capability is
    passed on as an int, with -1 standing in when the platform reports no
    capability.
    """
    on_cuda = current_platform.is_cuda()
    if on_cuda:
        cap = current_platform.get_device_capability()
        cap_as_int = cap.to_int() if cap is not None else -1
        return cutlass_scaled_mm_supports_fp4(cap_as_int)
    return False
--- a/vllm/model_executor/models/aria.py
+++ b/vllm/model_executor/models/aria.py
@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Optional, TypedDict, Union
+from typing import Annotated, Optional, Union
 import torch
 import torch.nn as nn
@ -29,6 +29,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                        PromptUpdate)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 # yapf: disable
 from .idefics2_vision_model import Idefics2VisionConfig
@ -42,15 +43,26 @@ from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
                    merge_multimodal_embeddings)
-class AriaImagePixelInputs(TypedDict):
+class AriaImagePixelInputs(TensorSchema):
    pixel_values: torch.Tensor
    pixel_mask: Optional[torch.Tensor]
    """
-    Shape:
+    Dimensions:
-        pixel_values: `(batch_size * num_images, num_channels, height, width)`
+        - b: Batch size
-        pixel_mask: `(batch_size * num_images, height, width)`
+        - n: Number of images
        - c: Number of channels
        - h: Height of each image
        - w: Width of each image
    """
    pixel_values: Annotated[
        torch.Tensor,
        TensorShape("bn", 3, "h", "w"),
    ]
    pixel_mask: Annotated[
        Optional[torch.Tensor],
        TensorShape("bn", "h", "w"),
    ]
 class AriaVisionTransformer(Idefics3VisionTransformer, SupportsQuant):
    packed_modules_mapping = {"qkv_proj": ["q_proj", "k_proj", "v_proj"]}
@ -540,12 +552,6 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal):
        self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
                                                self.vocab_size, logit_scale)
    def _validate_image_sizes(
            self, images: list[torch.Tensor]) -> list[torch.Tensor]:
        if not all(img.shape == images[0].shape for img in images):
            raise ValueError("All images must be the same size")
        return images
    def _parse_and_validate_image_input(
            self, **kwargs: object) -> Optional[AriaImagePixelInputs]:
        pixel_values = kwargs.pop("pixel_values", None)
@ -554,23 +560,9 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal):
        if pixel_values is None:
            return None
        if not isinstance(pixel_values, (torch.Tensor, list)):
            raise ValueError("Incorrect type of pixel values. "
                             f"Got type: {type(pixel_values)}")
        pixel_values = self._validate_image_sizes(pixel_values)
        pixel_values = flatten_bn(pixel_values, concat=True)
        if pixel_mask is not None:
            if not isinstance(pixel_mask, (torch.Tensor, list)):
                raise ValueError("Incorrect type of pixel mask. "
                                 f"Got type: {type(pixel_mask)}")
            pixel_mask = flatten_bn(pixel_mask, concat=True)
        return AriaImagePixelInputs(
-            pixel_values=pixel_values,
+            pixel_values=flatten_bn(pixel_values, concat=True),
-            pixel_mask=pixel_mask,
+            pixel_mask=flatten_bn(pixel_mask, concat=True),
        )
    def _create_patch_attention_mask(
--- a/Show More
+++ b/Show More