Address conflict

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
This commit is contained in:
Jee Jee Li 2025-12-03 06:10:36 +00:00
commit 83556e9d85
783 changed files with 23991 additions and 13021 deletions

View File

@ -108,6 +108,65 @@ The number of this test is less stable compared to the delay and latency benchma
WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`. WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`.
#### Default Parameters Field
We can specify default parameters in a JSON field with key `defaults`. Parameters defined in the field are applied globally to all serving tests, and can be overridden in test case fields. Here is an example:
<details>
<summary> An Example of default parameters field </summary>
```json
{
"defaults": {
"qps_list": [
"inf"
],
"server_environment_variables": {
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1
},
"server_parameters": {
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"block_size": 128,
"disable_log_stats": "",
"load_format": "dummy"
},
"client_parameters": {
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"num_prompts": 200,
"ignore-eos": ""
}
},
"tests": [
{
"test_name": "serving_llama3B_tp2_random_128_128",
"server_parameters": {
"model": "meta-llama/Llama-3.2-3B-Instruct",
"tensor_parallel_size": 2,
},
"client_parameters": {
"model": "meta-llama/Llama-3.2-3B-Instruct",
}
},
{
"test_name": "serving_qwen3_tp4_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-14B",
"tensor_parallel_size": 4,
},
"client_parameters": {
"model": "Qwen/Qwen3-14B",
}
}
]
}
```
</details>
### Visualizing the results ### Visualizing the results
The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](performance-benchmarks-descriptions.md) with real benchmarking results. The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](performance-benchmarks-descriptions.md) with real benchmarking results.

View File

@ -110,7 +110,8 @@ json2envs() {
wait_for_server() { wait_for_server() {
# wait for vllm server to start # wait for vllm server to start
# return 1 if vllm server crashes # return 1 if vllm server crashes
timeout 1200 bash -c ' local timeout_val="1200"
timeout "$timeout_val" bash -c '
until curl -X POST localhost:8000/v1/completions; do until curl -X POST localhost:8000/v1/completions; do
sleep 1 sleep 1
done' && return 0 || return 1 done' && return 0 || return 1
@ -316,12 +317,44 @@ run_throughput_tests() {
run_serving_tests() { run_serving_tests() {
# run serving tests using `vllm bench serve` command # run serving tests using `vllm bench serve` command
# $1: a json file specifying serving test cases # $1: a json file specifying serving test cases
#
# Supported JSON formats:
# 1) Plain format: top-level array
# [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
#
# 2) Default parameters field + plain format tests
# {
# "defaults": { ... },
# "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
# }
local serving_test_file local serving_test_file
serving_test_file=$1 serving_test_file=$1
# Iterate over serving tests # Iterate over serving tests
jq -c '.[]' "$serving_test_file" | while read -r params; do jq -c '
if type == "array" then
# Plain format: test cases array
.[]
elif (type == "object" and has("tests")) then
# merge the default parameters into each test case
. as $root
| ($root.defaults // {}) as $d
| ($root.tests // [])[]
# default qps / max_concurrency from defaults if missing
| .qps_list = (.qps_list // $d.qps_list)
| .max_concurrency_list = (.max_concurrency_list // $d.max_concurrency_list)
# merge envs / params: test overrides defaults
| .server_environment_variables =
(($d.server_environment_variables // {}) + (.server_environment_variables // {}))
| .server_parameters =
(($d.server_parameters // {}) + (.server_parameters // {}))
| .client_parameters =
(($d.client_parameters // {}) + (.client_parameters // {}))
else
error("Unsupported serving test file format: must be array or object with .tests")
end
' "$serving_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it. # get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name') test_name=$(echo "$params" | jq -r '.test_name')
if [[ ! "$test_name" =~ ^serving_ ]]; then if [[ ! "$test_name" =~ ^serving_ ]]; then
@ -335,20 +368,25 @@ run_serving_tests() {
continue continue
fi fi
# get client and server arguments # get client and server arguments (after merging the default parameters)
server_params=$(echo "$params" | jq -r '.server_parameters') server_params=$(echo "$params" | jq -r '.server_parameters')
server_envs=$(echo "$params" | jq -r '.server_environment_variables') server_envs=$(echo "$params" | jq -r '.server_environment_variables')
client_params=$(echo "$params" | jq -r '.client_parameters') client_params=$(echo "$params" | jq -r '.client_parameters')
server_args=$(json2args "$server_params") server_args=$(json2args "$server_params")
server_envs=$(json2envs "$server_envs") server_envs=$(json2envs "$server_envs")
client_args=$(json2args "$client_params") client_args=$(json2args "$client_params")
# qps_list
qps_list=$(echo "$params" | jq -r '.qps_list') qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list" echo "Running over qps list $qps_list"
# max_concurrency_list (fallback to num_prompts if missing)
max_concurrency_list=$(echo "$params" | jq -r '.max_concurrency_list') max_concurrency_list=$(echo "$params" | jq -r '.max_concurrency_list')
if [[ -z "$max_concurrency_list" || "$max_concurrency_list" == "null" ]]; then if [[ -z "$max_concurrency_list" || "$max_concurrency_list" == "null" ]]; then
num_prompts=$(echo "$client_params" | jq -r '.num_prompts') num_prompts=$(echo "$client_params" | jq -r '.num_prompts')
max_concurrency_list="[$num_prompts]" max_concurrency_list="[$num_prompts]"
fi fi
max_concurrency_list=$(echo "$max_concurrency_list" | jq -r '.[] | @sh') max_concurrency_list=$(echo "$max_concurrency_list" | jq -r '.[] | @sh')
echo "Running over max concurrency list $max_concurrency_list" echo "Running over max concurrency list $max_concurrency_list"

View File

@ -1,610 +0,0 @@
[
{
"test_name": "serving_llama8B_bf16_tp1_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_bf16_tp2_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_bf16_tp4_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_bf16_tp1_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_bf16_tp2_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_bf16_tp4_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int8_tp1_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int8_tp2_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int8_tp4_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int8_tp1_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int8_tp2_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int8_tp4_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int4_tp1_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int4_tp2_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int4_tp4_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int4_tp1_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int4_tp2_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int4_tp4_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
}
]

View File

@ -1,276 +1,246 @@
[ {
{ "defaults": {
"test_name": "serving_llama8B_tp1_sharegpt", "qps_list": [
"qps_list": [1, 4, 16, "inf"], "inf"
"max_concurrency_list": [32], ],
"server_environment_variables": { "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"VLLM_RPC_TIMEOUT": 100000, "server_environment_variables": {
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_RPC_TIMEOUT": 100000,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_CPU_SGL_KERNEL": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_KVCACHE_SPACE": 40 "VLLM_CPU_SGL_KERNEL": 1,
}, "VLLM_CPU_KVCACHE_SPACE": 40
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 32
}
}, },
{ "server_parameters": {
"test_name": "serving_llama8B_tp2_sharegpt", "model": "meta-llama/Llama-3.1-8B-Instruct",
"qps_list": [1, 4, 16, "inf"], "tensor_parallel_size": 1,
"max_concurrency_list": [32], "dtype": "bfloat16",
"server_environment_variables": { "distributed_executor_backend": "mp",
"VLLM_RPC_TIMEOUT": 100000, "block_size": 128,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "trust_remote_code": "",
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "disable_log_stats": "",
"VLLM_CPU_SGL_KERNEL": 1, "enforce_eager": "",
"VLLM_CPU_KVCACHE_SPACE": 40 "max_num_batched_tokens": 2048,
}, "max_num_seqs": 256,
"server_parameters": { "load_format": "dummy"
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 32
}
}, },
{ "client_parameters": {
"test_name": "serving_llama8B_tp1_random_128_128", "model": "meta-llama/Llama-3.1-8B-Instruct",
"qps_list": [1, 4, 16, "inf"], "backend": "vllm",
"max_concurrency_list": [32], "ignore-eos": "",
"server_environment_variables": { "num_prompts": 200
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 32
}
},
{
"test_name": "serving_llama8B_tp2_random_128_128",
"qps_list": [1, 4, 16, "inf"],
"max_concurrency_list": [32],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 32
}
},
{
"test_name": "serving_llama8B_tp1_random_128_2048",
"qps_list": [1, 4, 16, "inf"],
"max_concurrency_list": [32],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048,
"ignore-eos": "",
"num_prompts": 32
}
},
{
"test_name": "serving_llama8B_tp2_random_128_2048",
"qps_list": [1, 4, 16, "inf"],
"max_concurrency_list": [32],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048,
"ignore-eos": "",
"num_prompts": 32
}
},
{
"test_name": "serving_llama8B_tp1_random_2048_128",
"qps_list": [1, 4, 16, "inf"],
"max_concurrency_list": [32],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 32
}
},
{
"test_name": "serving_llama8B_tp2_random_2048_128",
"qps_list": [1, 4, 16, "inf"],
"max_concurrency_list": [32],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 32
}
} }
] },
"tests": [
{
"test_name": "serving_llama8B_tp1_sharegpt",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
}
},
{
"test_name": "serving_llama8B_tp2_sharegpt",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
}
},
{
"test_name": "serving_llama8B_tp1_random_128_128",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp2_random_128_128",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp4_random_128_128",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp1_random_128_2048",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp2_random_128_2048",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp4_random_128_2048",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp1_random_2048_128",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp2_random_2048_128",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp4_random_2048_128",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128
}
},
{
"test_name": "serving_llama3B_tp1_random_128_128",
"server_parameters": {
"model": "meta-llama/Llama-3.2-3B-Instruct",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "meta-llama/Llama-3.2-3B-Instruct",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_granite2B_tp1_random_128_128",
"server_parameters": {
"model": "ibm-granite/granite-3.2-2b-instruct",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "ibm-granite/granite-3.2-2b-instruct",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen1.7B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-1.7B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-1.7B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen4B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-4B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-4B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen8B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-8B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-8B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_glm9B_tp1_random_128_128",
"server_parameters": {
"model": "zai-org/glm-4-9b-hf",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "zai-org/glm-4-9b-hf",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_gemma7B_tp1_random_128_128",
"server_parameters": {
"model": "google/gemma-7b",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "google/gemma-7b",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
}
]
}

View File

@ -8,7 +8,7 @@ steps:
commands: commands:
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here: # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7 # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts" - "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh" - "bash .buildkite/scripts/upload-wheels.sh"
@ -30,19 +30,6 @@ steps:
DOCKER_BUILDKIT: "1" DOCKER_BUILDKIT: "1"
# x86 + CUDA builds # x86 + CUDA builds
- label: "Build wheel - CUDA 12.8"
depends_on: ~
id: build-wheel-cuda-12-8
agents:
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh"
env:
DOCKER_BUILDKIT: "1"
- label: "Build wheel - CUDA 12.9" - label: "Build wheel - CUDA 12.9"
depends_on: ~ depends_on: ~
id: build-wheel-cuda-12-9 id: build-wheel-cuda-12-9
@ -109,7 +96,6 @@ steps:
- label: "Annotate release workflow" - label: "Annotate release workflow"
depends_on: depends_on:
- create-multi-arch-manifest - create-multi-arch-manifest
- build-wheel-cuda-12-8
id: annotate-release-workflow id: annotate-release-workflow
agents: agents:
queue: cpu_queue_postmerge queue: cpu_queue_postmerge

View File

@ -0,0 +1,369 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# do not complain about line length (for docstring)
# ruff: noqa: E501
import argparse
import json
import re
import sys
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Any
from urllib.parse import quote
if not sys.version_info >= (3, 12):
raise RuntimeError("This script requires Python 3.12 or higher.")
# HTML template for every generated index page; "{items}" is replaced with
# one "<a ...>" row per entry. The meta tag advertises the simple-repository
# API version so pip treats these pages as a package index.
INDEX_HTML_TEMPLATE = """<!DOCTYPE html>
<html>
<meta name="pypi:repository-version" content="1.0">
<body>
{items}
</body>
</html>
"""
@dataclass
class WheelFileInfo:
package_name: str
version: str
build_tag: str | None
python_tag: str
abi_tag: str
platform_tag: str
variant: str | None
filename: str
def parse_from_filename(file: str) -> WheelFileInfo:
"""
Parse wheel file name to extract metadata.
The format of wheel names:
{package_name}-{version}(-{build_tag})?-{python_tag}-{abi_tag}-{platform_tag}.whl
All versions could contain a variant like '+cu129' or '.cpu' or `.rocm` (or not).
Example:
vllm-0.11.0-cp38-abi3-manylinux1_x86_64.whl
vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl
vllm-0.11.1rc8.dev14+gaa384b3c0-cp38-abi3-manylinux2014_aarch64.whl
vllm-0.11.1rc8.dev14+gaa384b3c0.cu130-cp38-abi3-manylinux1_x86_64.whl
"""
wheel_file_re = re.compile(
r"^(?P<package_name>.+)-(?P<version>[^-]+?)(-(?P<build_tag>[^-]+))?-(?P<python_tag>[^-]+)-(?P<abi_tag>[^-]+)-(?P<platform_tag>[^-]+)\.whl$"
)
match = wheel_file_re.match(file)
if not match:
raise ValueError(f"Invalid wheel file name: {file}")
package_name = match.group("package_name")
version = match.group("version")
build_tag = match.group("build_tag")
python_tag = match.group("python_tag")
abi_tag = match.group("abi_tag")
platform_tag = match.group("platform_tag")
# extract variant from version
variant = None
if "dev" in version:
ver_after_dev = version.split("dev")[-1]
if "." in ver_after_dev:
variant = ver_after_dev.split(".")[-1]
version = version.removesuffix("." + variant)
else:
if "+" in version:
version, variant = version.split("+")
return WheelFileInfo(
package_name=package_name,
version=version,
build_tag=build_tag,
python_tag=python_tag,
abi_tag=abi_tag,
platform_tag=platform_tag,
variant=variant,
filename=file,
)
def generate_project_list(subdir_names: list[str]) -> str:
    """
    Generate the project-list HTML page linking to each project & variant
    sub-directory, sorted by name.
    """
    anchors = [
        f' <a href="{entry}/">{entry}/</a><br/>'
        for entry in (raw.strip("/").strip(".") for raw in sorted(subdir_names))
    ]
    return INDEX_HTML_TEMPLATE.format(items="\n".join(anchors))
def generate_package_index_and_metadata(
    wheel_files: list[WheelFileInfo], wheel_base_dir: Path, index_base_dir: Path
) -> tuple[str, str]:
    """
    Generate the index HTML for one package, linking to the actual wheel
    files, together with a machine-readable JSON metadata dump.

    Returns:
        (index_html, metadata_json) strings.
    """
    anchors: list[str] = []
    entries: list[dict] = []
    # Links are relative so the index works wherever it is hosted, as long as
    # it sits next to the wheel directory (walk_up=True needs Python 3.12+).
    link_prefix = wheel_base_dir.relative_to(index_base_dir, walk_up=True)
    for info in sorted(wheel_files, key=lambda w: w.filename):
        # handle with '+' in URL, and avoid double-encoding '/' and already-encoded '%2B'
        # NOTE: this is AWS S3 specific behavior!
        quoted_path = quote((link_prefix / info.filename).as_posix(), safe=":%/")
        anchors.append(f' <a href="{quoted_path}">{info.filename}</a><br/>')
        entry = asdict(info)
        entry["path"] = quoted_path
        entries.append(entry)
    return (
        INDEX_HTML_TEMPLATE.format(items="\n".join(anchors)),
        json.dumps(entries, indent=2),
    )
def generate_index_and_metadata(
    whl_files: list[str],
    wheel_base_dir: Path,
    index_base_dir: Path,
    default_variant: str | None = None,
    alias_to_default: str | None = None,
):
    """
    Generate index for all wheel files.

    Args:
        whl_files (list[str]): List of wheel files (must be directly under `wheel_base_dir`).
        wheel_base_dir (Path): Base directory for wheel files.
        index_base_dir (Path): Base directory to store index files.
        default_variant (str | None): The default variant name, if any.
        alias_to_default (str | None): Alias variant name for the default variant, if any.

    First, parse all wheel files to extract metadata.
    We need to collect all wheel files for each variant, and generate an index for it (in a sub-directory).
    The index for the default variant (if any) is generated in the root index directory.

    If `default_variant` is provided, all wheels must have variant suffixes, and the default variant index
    is purely a copy of the corresponding variant index, with only the links adjusted.
    Otherwise, all wheels without variant suffixes are treated as the default variant.

    If `alias_to_default` is provided, an additional alias sub-directory is created, it has the same content
    as the default variant index, but the links are adjusted accordingly.

    Index directory structure:

        index_base_dir/ (hosted at wheels.vllm.ai/{nightly,$commit,$version}/)
            index.html     # project list, linking to "vllm/" and other packages, and all variant sub-directories
            vllm/
                index.html     # package index, pointing to actual files in wheel_base_dir (relative path)
                metadata.json  # machine-readable metadata for all wheels in this package
            cpu/           # cpu variant sub-directory
                index.html
                vllm/
                    index.html
                    metadata.json
            cu129/         # cu129 is actually the alias to default variant
                index.html
                vllm/
                    index.html
                    metadata.json
            cu130/         # cu130 variant sub-directory
                index.html
                vllm/
                    index.html
                    metadata.json
            ...

    metadata.json stores a dump of all wheel files' metadata in a machine-readable format:

        [
            {
                "package_name": "vllm",
                "version": "0.10.2rc2",
                "build_tag": null,
                "python_tag": "cp38",
                "abi_tag": "abi3",
                "platform_tag": "manylinux2014_aarch64",
                "variant": "cu129",
                "filename": "vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl",
                "path": "../vllm-0.10.2rc2%2Bcu129-cp38-abi3-manylinux2014_aarch64.whl"  # to be concatenated with the directory URL and URL-encoded
            },
            ...
        ]
    """
    parsed_files = [parse_from_filename(f) for f in whl_files]
    if not parsed_files:
        print("No wheel files found, skipping index generation.")
        return

    # Group by variant; wheels without a variant suffix fall into "default".
    variant_to_files: dict[str, list[WheelFileInfo]] = {}
    for file in parsed_files:
        variant = file.variant or "default"
        if variant not in variant_to_files:
            variant_to_files[variant] = []
        variant_to_files[variant].append(file)
    print(f"Found variants: {list(variant_to_files.keys())}")

    # sanity check for default variant
    # NOTE(review): `default_variant` is only validated here — nothing below
    # uses it to build the root index as the docstring suggests; confirm intent.
    if default_variant:
        if "default" in variant_to_files:
            raise ValueError(
                "All wheel files must have variant suffixes when `default_variant` is specified."
            )
        if default_variant not in variant_to_files:
            raise ValueError(
                f"Default variant '{default_variant}' not found among wheel files."
            )
    if alias_to_default:
        if "default" not in variant_to_files:
            # e.g. only some wheels are uploaded to S3 currently
            print(
                "[WARN] Alias to default variant specified, but no default variant found."
            )
        elif alias_to_default in variant_to_files:
            raise ValueError(
                f"Alias variant name '{alias_to_default}' already exists among wheel files."
            )
        else:
            # The alias shares the default variant's file list; links are
            # recomputed per-directory below, so a shallow copy suffices.
            variant_to_files[alias_to_default] = variant_to_files["default"].copy()
            print(f"Alias variant '{alias_to_default}' created for default variant.")

    # Generate index for each variant
    subdir_names = set()
    for variant, files in variant_to_files.items():
        if variant == "default":
            # The default variant's indices live directly in the root dir.
            variant_dir = index_base_dir
        else:
            variant_dir = index_base_dir / variant
            subdir_names.add(variant)
        variant_dir.mkdir(parents=True, exist_ok=True)
        # gather all package names in this variant
        packages = set(f.package_name for f in files)
        if variant == "default":
            # these packages should also appear in the "project list"
            # generate after all variants are processed
            subdir_names = subdir_names.union(packages)
        else:
            # generate project list for this variant directly
            project_list_str = generate_project_list(sorted(packages))
            with open(variant_dir / "index.html", "w") as f:
                f.write(project_list_str)
        for package in packages:
            # filter files belonging to this package only
            package_files = [f for f in files if f.package_name == package]
            package_dir = variant_dir / package
            package_dir.mkdir(parents=True, exist_ok=True)
            index_str, metadata_str = generate_package_index_and_metadata(
                package_files, wheel_base_dir, package_dir
            )
            with open(package_dir / "index.html", "w") as f:
                f.write(index_str)
            with open(package_dir / "metadata.json", "w") as f:
                f.write(metadata_str)

    # Generate top-level project list index (root packages + variant subdirs).
    project_list_str = generate_project_list(sorted(subdir_names))
    with open(index_base_dir / "index.html", "w") as f:
        f.write(project_list_str)
if __name__ == "__main__":
    """
    Arguments:
        --version <version> : version string for the current build (e.g., commit hash)
        --current-objects <path_to_json> : path to JSON file containing current S3 objects listing in this version directory
        --output-dir <output_directory> : directory to store generated index files
        --alias-to-default <alias_variant_name> : (optional) alias variant name for the default variant
    """
    parser = argparse.ArgumentParser(
        description="Process nightly build wheel files to generate indices."
    )
    parser.add_argument(
        "--version",
        type=str,
        required=True,
        help="Version string for the current build (e.g., commit hash)",
    )
    parser.add_argument(
        "--current-objects",
        type=str,
        required=True,
        help="Path to JSON file containing current S3 objects listing in this version directory",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        required=True,
        help="Directory to store generated index files",
    )
    parser.add_argument(
        "--alias-to-default",
        type=str,
        default=None,
        help="Alias variant name for the default variant",
    )
    args = parser.parse_args()

    version = args.version
    # The version is used as a path component locally and in S3 keys below;
    # a separator would silently change the layout, so reject it up front.
    if "/" in version or "\\" in version:
        raise ValueError("Version string must not contain slashes.")

    current_objects_path = Path(args.current_objects)
    output_dir = Path(args.output_dir)
    # mkdir with exist_ok=True is idempotent — no need for a racy exists() pre-check.
    output_dir.mkdir(parents=True, exist_ok=True)

    # Read current objects JSON
    with open(current_objects_path) as f:
        current_objects: dict[str, list[dict[str, Any]]] = json.load(f)
    # current_objects looks like from list_objects_v2 S3 API:
    """
    "Contents": [
        {
            "Key": "e2f56c309d2a28899c68975a7e104502d56deb8f/vllm-0.11.2.dev363+ge2f56c309-cp38-abi3-manylinux1_x86_64.whl",
            "LastModified": "2025-11-28T14:00:32+00:00",
            "ETag": "\"37a38339c7cdb61ca737021b968075df-52\"",
            "ChecksumAlgorithm": [
                "CRC64NVME"
            ],
            "ChecksumType": "FULL_OBJECT",
            "Size": 435649349,
            "StorageClass": "STANDARD"
        },
        ...
    ]
    """

    # Extract wheel file keys; only the bare filename matters for indexing.
    wheel_files = []
    for item in current_objects.get("Contents", []):
        key: str = item["Key"]
        if key.endswith(".whl"):
            wheel_files.append(key.split("/")[-1])  # only the filename is used
    print(f"Found {len(wheel_files)} wheel files for version {version}: {wheel_files}")

    # Generate index and metadata, assuming wheels and indices are stored as:
    #   s3://vllm-wheels/{version}/<wheel files>
    #   s3://vllm-wheels/<anything>/<index files>
    # output_dir is already a Path, so derive the sibling wheel dir directly.
    wheel_base_dir = output_dir.parent / version
    index_base_dir = output_dir
    generate_index_and_metadata(
        whl_files=wheel_files,
        wheel_base_dir=wheel_base_dir,
        index_base_dir=index_base_dir,
        default_variant=None,
        alias_to_default=args.alias_to_default,
    )
    print(f"Successfully generated index and metadata in {output_dir}")

View File

@ -7,53 +7,51 @@ set -ex
# allow to bind to different cores # allow to bind to different cores
CORE_RANGE=${CORE_RANGE:-0-16} CORE_RANGE=${CORE_RANGE:-0-16}
OMP_CORE_RANGE=${OMP_CORE_RANGE:-0-16} OMP_CORE_RANGE=${OMP_CORE_RANGE:-0-16}
NUMA_NODE=${NUMA_NODE:-0}
export CMAKE_BUILD_PARALLEL_LEVEL=32 export CMAKE_BUILD_PARALLEL_LEVEL=16
# Setup cleanup # Setup cleanup
remove_docker_container() { remove_docker_container() {
set -e; set -e;
docker rm -f cpu-test-"$NUMA_NODE" || true; docker rm -f cpu-test || true;
} }
trap remove_docker_container EXIT trap remove_docker_container EXIT
remove_docker_container remove_docker_container
# Try building the docker image # Try building the docker image
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu . docker build --tag cpu-test --target vllm-test -f docker/Dockerfile.cpu .
# Run the image, setting --shm-size=4g for tensor parallel. # Run the image
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE" docker run -itd --cpuset-cpus="$CORE_RANGE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test cpu-test
function cpu_tests() { function cpu_tests() {
set -e set -e
export NUMA_NODE=$2
docker exec cpu-test-"$NUMA_NODE" bash -c " docker exec cpu-test bash -c "
set -e set -e
pip list" pip list"
# offline inference # offline inference
docker exec cpu-test-"$NUMA_NODE" bash -c " docker exec cpu-test bash -c "
set -e set -e
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
# Run kernel tests # Run kernel tests
docker exec cpu-test-"$NUMA_NODE" bash -c " docker exec cpu-test bash -c "
set -e set -e
pytest -x -v -s tests/kernels/test_onednn.py pytest -x -v -s tests/kernels/test_onednn.py
pytest -x -v -s tests/kernels/attention/test_cpu_attn.py" pytest -x -v -s tests/kernels/attention/test_cpu_attn.py"
# basic online serving # basic online serving
docker exec cpu-test-"$NUMA_NODE" bash -c ' docker exec cpu-test bash -c '
set -e set -e
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS vllm serve meta-llama/Llama-3.2-3B-Instruct --max-model-len 2048 & VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS vllm serve Qwen/Qwen3-0.6B --max-model-len 2048 &
server_pid=$! server_pid=$!
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1 timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
vllm bench serve \ vllm bench serve \
--backend vllm \ --backend vllm \
--dataset-name random \ --dataset-name random \
--model meta-llama/Llama-3.2-3B-Instruct \ --model Qwen/Qwen3-0.6B \
--num-prompts 20 \ --num-prompts 20 \
--endpoint /v1/completions --endpoint /v1/completions
kill -s SIGTERM $server_pid &' kill -s SIGTERM $server_pid &'
@ -61,4 +59,4 @@ function cpu_tests() {
# All of CPU tests are expected to be finished less than 40 mins. # All of CPU tests are expected to be finished less than 40 mins.
export -f cpu_tests export -f cpu_tests
timeout 2h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE" timeout 2h bash -c cpu_tests

View File

@ -21,8 +21,8 @@ trap remove_docker_container EXIT
remove_docker_container remove_docker_container
# Try building the docker image # Try building the docker image
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu . numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu . numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
# Run the image, setting --shm-size=4g for tensor parallel. # Run the image, setting --shm-size=4g for tensor parallel.
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE" docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"

View File

@ -35,7 +35,7 @@ docker run \
echo $ZE_AFFINITY_MASK echo $ZE_AFFINITY_MASK
pip install tblib==3.1.0 pip install tblib==3.1.0
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager

View File

@ -2,6 +2,28 @@
set -ex set -ex
# ======== part 0: setup ========
BUCKET="vllm-wheels"
INDICES_OUTPUT_DIR="indices"
DEFAULT_VARIANT_ALIAS="cu129" # align with vLLM_MAIN_CUDA_VERSION in vllm/envs.py
PYTHON=${PYTHON_PROG:=python3} # try to read from env var, otherwise use python3
SUBPATH=$BUILDKITE_COMMIT
S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
# detect if python3.10+ is available
has_new_python=$($PYTHON -c "print(1 if __import__('sys').version_info >= (3,12) else 0)")
if [[ "$has_new_python" -eq 0 ]]; then
# use new python from docker
docker pull python:3-slim
PYTHON="docker run --rm -v $(pwd):/app -w /app python:3-slim python3"
fi
echo "Using python interpreter: $PYTHON"
echo "Python version: $($PYTHON --version)"
# ========= part 1: collect, rename & upload the wheel ==========
# Assume wheels are in artifacts/dist/*.whl # Assume wheels are in artifacts/dist/*.whl
wheel_files=(artifacts/dist/*.whl) wheel_files=(artifacts/dist/*.whl)
@ -10,74 +32,69 @@ if [[ ${#wheel_files[@]} -ne 1 ]]; then
echo "Error: Expected exactly one wheel file in artifacts/dist/, but found ${#wheel_files[@]}" echo "Error: Expected exactly one wheel file in artifacts/dist/, but found ${#wheel_files[@]}"
exit 1 exit 1
fi fi
# Get the single wheel file
wheel="${wheel_files[0]}" wheel="${wheel_files[0]}"
# Detect architecture and rename 'linux' to appropriate manylinux version # current build image uses ubuntu 20.04, which corresponds to manylinux_2_31
arch=$(uname -m) # refer to https://github.com/mayeut/pep600_compliance?tab=readme-ov-file#acceptable-distros-to-build-wheels
if [[ $arch == "x86_64" ]]; then manylinux_version="manylinux_2_31"
manylinux_version="manylinux1"
elif [[ $arch == "aarch64" ]]; then
manylinux_version="manylinux2014"
else
echo "Warning: Unknown architecture $arch, using manylinux1 as default"
manylinux_version="manylinux1"
fi
# Rename 'linux' to the appropriate manylinux version in the wheel filename # Rename 'linux' to the appropriate manylinux version in the wheel filename
if [[ "$wheel" != *"linux"* ]]; then
echo "Error: Wheel filename does not contain 'linux': $wheel"
exit 1
fi
new_wheel="${wheel/linux/$manylinux_version}" new_wheel="${wheel/linux/$manylinux_version}"
mv -- "$wheel" "$new_wheel" mv -- "$wheel" "$new_wheel"
wheel="$new_wheel" wheel="$new_wheel"
echo "Renamed wheel to: $wheel"
# Extract the version from the wheel # Extract the version from the wheel
version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2) version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
echo "Version: $version" echo "Version in wheel: $version"
pure_version="${version%%+*}"
echo "Pure version (without variant): $pure_version"
normal_wheel="$wheel" # Save the original wheel filename # copy wheel to its own bucket
aws s3 cp "$wheel" "$S3_COMMIT_PREFIX"
# If the version contains "dev", rename it to v1.0.0.dev for consistency # ========= part 2: generate and upload indices ==========
if [[ $version == *dev* ]]; then # generate indices for all existing wheels in the commit directory
suffix="${version##*.}" # this script might be run multiple times if there are multiple variants being built
if [[ $suffix == cu* ]]; then # so we need to guarantee there is little chance for "TOCTOU" issues
new_version="1.0.0.dev+${suffix}" # i.e., one process is generating indices while another is uploading a new wheel
else # so we need to ensure no time-consuming operations happen below
new_version="1.0.0.dev"
fi
new_wheel="${wheel/$version/$new_version}"
# use cp to keep both files in the artifacts directory
cp -- "$wheel" "$new_wheel"
wheel="$new_wheel"
version="$new_version"
fi
# Upload the wheel to S3 # list all wheels in the commit directory
python3 .buildkite/generate_index.py --wheel "$normal_wheel" echo "Existing wheels on S3:"
aws s3 ls "$S3_COMMIT_PREFIX"
obj_json="objects.json"
aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$SUBPATH/" --delimiter / --output json > "$obj_json"
mkdir -p "$INDICES_OUTPUT_DIR"
# generate index for this commit # call script to generate indicies for all existing wheels
aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/" # this indices have relative paths that could work as long as it is next to the wheel directory in s3
aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/" # i.e., the wheels are always in s3://vllm-wheels/<commit>/
# and indices can be placed in /<commit>/, or /nightly/, or /<version>/
if [[ $normal_wheel == *"cu129"* ]]; then if [[ ! -z "$DEFAULT_VARIANT_ALIAS" ]]; then
# only upload index.html for cu129 wheels (default wheels) as it alias_arg="--alias-to-default $DEFAULT_VARIANT_ALIAS"
# is available on both x86 and arm64
aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
else else
echo "Skipping index files for non-cu129 wheels" alias_arg=""
fi fi
# generate index for nightly $PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" $alias_arg
aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
if [[ $normal_wheel == *"cu129"* ]]; then # copy indices to /<commit>/ unconditionally
# only upload index.html for cu129 wheels (default wheels) as it echo "Uploading indices to $S3_COMMIT_PREFIX"
# is available on both x86 and arm64 aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "$S3_COMMIT_PREFIX"
aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
else # copy to /nightly/ only if it is on the main branch and not a PR
echo "Skipping index files for non-cu129 wheels" if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]]; then
echo "Uploading indices to overwrite /nightly/"
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/nightly/"
fi fi
aws s3 cp "$wheel" "s3://vllm-wheels/$version/" # copy to /<pure_version>/ only if it does not have "dev" in the version
aws s3 cp index.html "s3://vllm-wheels/$version/vllm/index.html" if [[ "$version" != *"dev"* ]]; then
echo "Uploading indices to overwrite /$pure_version/"
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/"
fi

View File

@ -39,9 +39,9 @@ steps:
# if this test fails, it means the nightly torch version is not compatible with some # if this test fails, it means the nightly torch version is not compatible with some
# of the dependencies. Please check the error message and add the package to whitelist # of the dependencies. Please check the error message and add the package to whitelist
# in /vllm/tools/pre_commit/generate_nightly_torch_test.py # in /vllm/tools/pre_commit/generate_nightly_torch_test.py
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
agent_pool: mi325_1 agent_pool: mi325_1
# grade: Blocking grade: Blocking
soft_fail: true soft_fail: true
source_file_dependencies: source_file_dependencies:
- requirements/nightly_torch_test.txt - requirements/nightly_torch_test.txt
@ -50,9 +50,9 @@ steps:
- label: Async Engine, Inputs, Utils, Worker Test # 10min - label: Async Engine, Inputs, Utils, Worker Test # 10min
timeout_in_minutes: 15 timeout_in_minutes: 15
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
agent_pool: mi325_1 agent_pool: mi325_1
# grade: Blocking grade: Blocking
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/multimodal - tests/multimodal
@ -61,17 +61,18 @@ steps:
- pytest -v -s -m 'not cpu_test' multimodal - pytest -v -s -m 'not cpu_test' multimodal
- pytest -v -s utils_ - pytest -v -s utils_
- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 4 mins - label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 15min
timeout_in_minutes: 10 timeout_in_minutes: 20
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
agent_pool: mi325_1 agent_pool: mi325_1
# grade: Blocking grade: Blocking
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/test_inputs.py - tests/test_inputs.py
- tests/test_outputs.py - tests/test_outputs.py
- tests/multimodal - tests/multimodal
- tests/standalone_tests/lazy_imports.py - tests/standalone_tests/lazy_imports.py
- tests/tokenizers_
- tests/transformers_utils - tests/transformers_utils
- tests/config - tests/config
no_gpu: true no_gpu: true
@ -80,6 +81,7 @@ steps:
- pytest -v -s test_inputs.py - pytest -v -s test_inputs.py
- pytest -v -s test_outputs.py - pytest -v -s test_outputs.py
- pytest -v -s -m 'cpu_test' multimodal - pytest -v -s -m 'cpu_test' multimodal
- pytest -v -s tokenizers_
- pytest -v -s transformers_utils - pytest -v -s transformers_utils
- pytest -v -s config - pytest -v -s config
@ -113,9 +115,9 @@ steps:
- pytest -v -s basic_correctness/test_cpu_offload.py - pytest -v -s basic_correctness/test_cpu_offload.py
- label: Entrypoints Unit Tests # 5min - label: Entrypoints Unit Tests # 5min
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
agent_pool: mi325_1 agent_pool: mi325_1
# grade: Blocking grade: Blocking
timeout_in_minutes: 10 timeout_in_minutes: 10
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
fast_check: true fast_check: true
@ -212,6 +214,7 @@ steps:
# test with internal dp # test with internal dp
- python3 ../examples/offline_inference/data_parallel.py --enforce-eager - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
@ -250,9 +253,9 @@ steps:
- torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
- label: EPLB Algorithm Test # 5min - label: EPLB Algorithm Test # 5min
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
agent_pool: mi325_1 agent_pool: mi325_1
# grade: Blocking grade: Blocking
timeout_in_minutes: 15 timeout_in_minutes: 15
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
source_file_dependencies: source_file_dependencies:
@ -308,23 +311,20 @@ steps:
- pytest -v -s test_regression.py - pytest -v -s test_regression.py
working_dir: "/vllm-workspace/tests" # optional working_dir: "/vllm-workspace/tests" # optional
- label: Engine Test # 25min - label: Engine Test # 9min
timeout_in_minutes: 40 timeout_in_minutes: 15
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1 agent_pool: mi325_1
# grade: Blocking # grade: Blocking
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/engine - tests/engine
- tests/tokenization
- tests/test_sequence - tests/test_sequence
- tests/test_config - tests/test_config
- tests/test_logger - tests/test_logger
- tests/test_vllm_port - tests/test_vllm_port
commands: commands:
- pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
# OOM in the CI unless we run this separately
- pytest -v -s tokenization
- label: V1 Test e2e + engine # 30min - label: V1 Test e2e + engine # 30min
timeout_in_minutes: 45 timeout_in_minutes: 45
@ -342,9 +342,9 @@ steps:
- label: V1 Test entrypoints # 35min - label: V1 Test entrypoints # 35min
timeout_in_minutes: 50 timeout_in_minutes: 50
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
agent_pool: mi325_1 agent_pool: mi325_1
# grade: Blocking grade: Blocking
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/v1 - tests/v1
@ -392,6 +392,20 @@ steps:
commands: commands:
- pytest -v -s v1/attention - pytest -v -s v1/attention
- label: Batch Invariance Tests (H100) # 10min
mirror_hardwares: [amdexperimental]
agent_pool: mi325_1
timeout_in_minutes: 25
gpu: h100
source_file_dependencies:
- vllm/
- tests/v1/determinism/
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pip install pytest-timeout pytest-forked
- pytest -v -s v1/determinism/test_batch_invariance.py
- pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
- label: V1 Test attention (B200) # 10min - label: V1 Test attention (B200) # 10min
timeout_in_minutes: 30 timeout_in_minutes: 30
gpu: b200 gpu: b200
@ -402,9 +416,9 @@ steps:
- VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this
- label: V1 Test others (CPU) # 5 mins - label: V1 Test others (CPU) # 5 mins
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
agent_pool: mi325_1 agent_pool: mi325_1
# grade: Blocking grade: Blocking
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/v1 - tests/v1
@ -496,7 +510,7 @@ steps:
- label: PyTorch Compilation Unit Tests # 15min - label: PyTorch Compilation Unit Tests # 15min
timeout_in_minutes: 30 timeout_in_minutes: 30
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1 agent_pool: mi325_1
# grade: Blocking # grade: Blocking
torch_nightly: true torch_nightly: true
@ -513,7 +527,7 @@ steps:
- label: PyTorch Fullgraph Smoke Test # 15min - label: PyTorch Fullgraph Smoke Test # 15min
timeout_in_minutes: 30 timeout_in_minutes: 30
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1 agent_pool: mi325_1
# grade: Blocking # grade: Blocking
torch_nightly: true torch_nightly: true
@ -569,7 +583,7 @@ steps:
- label: Kernels Attention Test %N # 23min - label: Kernels Attention Test %N # 23min
timeout_in_minutes: 35 timeout_in_minutes: 35
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_8 agent_pool: mi325_8
# grade: Blocking # grade: Blocking
source_file_dependencies: source_file_dependencies:
@ -596,7 +610,7 @@ steps:
- label: Kernels MoE Test %N # 40min - label: Kernels MoE Test %N # 40min
timeout_in_minutes: 60 timeout_in_minutes: 60
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_8 agent_pool: mi325_8
# grade: Blocking # grade: Blocking
source_file_dependencies: source_file_dependencies:
@ -623,6 +637,26 @@ steps:
commands: commands:
- pytest -v -s kernels/mamba - pytest -v -s kernels/mamba
- label: Kernels DeepGEMM Test (H100) # Nvidia-centric
# Not replicating for CUTLAS & CuTe
timeout_in_minutes: 45
gpu: h100
num_gpus: 1
source_file_dependencies:
- tools/install_deepgemm.sh
- vllm/utils/deep_gemm.py
- vllm/model_executor/layers/fused_moe
- vllm/model_executor/layers/quantization
- tests/kernels/quantization/test_block_fp8.py
- tests/kernels/moe/test_deepgemm.py
- tests/kernels/moe/test_batched_deepgemm.py
- tests/kernels/attention/test_deepgemm_attention.py
commands:
- pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm
- pytest -v -s kernels/moe/test_deepgemm.py
- pytest -v -s kernels/moe/test_batched_deepgemm.py
- pytest -v -s kernels/attention/test_deepgemm_attention.py
- label: Model Executor Test # 23min - label: Model Executor Test # 23min
timeout_in_minutes: 35 timeout_in_minutes: 35
torch_nightly: true torch_nightly: true
@ -681,6 +715,7 @@ steps:
# we can only upgrade after this is resolved # we can only upgrade after this is resolved
# TODO(jerryzh168): resolve the above comment # TODO(jerryzh168): resolve the above comment
- uv pip install --system torchao==0.13.0 - uv pip install --system torchao==0.13.0
- uv pip install --system conch-triton-kernels
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
- label: LM Eval Small Models # 15min - label: LM Eval Small Models # 15min
@ -900,6 +935,18 @@ steps:
commands: commands:
- pytest -v -s models/language/pooling_mteb_test - pytest -v -s models/language/pooling_mteb_test
- label: Multi-Modal Processor Test (CPU)
timeout_in_minutes: 60
mirror_hardwares: [amdexperimental]
agent_pool: mi325_1
source_file_dependencies:
- vllm/
- tests/models/multimodal
no_gpu: true
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
- label: Multi-Modal Processor Test # 44min - label: Multi-Modal Processor Test # 44min
timeout_in_minutes: 60 timeout_in_minutes: 60
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
@ -1056,6 +1103,7 @@ steps:
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
- pytest -v -s tests/kernels/moe/test_flashinfer.py - pytest -v -s tests/kernels/moe/test_flashinfer.py
- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
- label: Blackwell Fusion and Compile Tests # 30 min - label: Blackwell Fusion and Compile Tests # 30 min
timeout_in_minutes: 40 timeout_in_minutes: 40
@ -1065,11 +1113,19 @@ steps:
- csrc/quantization/fp4/ - csrc/quantization/fp4/
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py - vllm/v1/attention/backends/flashinfer.py
- vllm/v1/worker/
- vllm/v1/cudagraph_dispatcher.py
- vllm/compilation/ - vllm/compilation/
# can affect pattern matching # can affect pattern matching
- vllm/model_executor/layers/layernorm.py - vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py - vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py - vllm/model_executor/layers/quantization/input_quant_fp8.py
- vllm/model_executor/layers/fused_moe/layer.py
- tests/compile/test_fusion_attn.py
- tests/compile/test_silu_mul_quant_fusion.py
- tests/compile/distributed/test_fusion_all_reduce.py
- tests/compile/distributed/test_fusions_e2e.py
- tests/compile/fullgraph/test_full_graph.py
commands: commands:
- nvidia-smi - nvidia-smi
- pytest -v -s tests/compile/test_fusion_attn.py - pytest -v -s tests/compile/test_fusion_attn.py
@ -1080,7 +1136,7 @@ steps:
# Wrap with quotes to escape yaml # Wrap with quotes to escape yaml
- "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'" - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40) # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
- pytest -v -s tests/compile/distributed/test_full_graph.py::test_fp8_kv_scale_compile - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
- label: Blackwell Fusion E2E Tests # 30 min - label: Blackwell Fusion E2E Tests # 30 min
timeout_in_minutes: 40 timeout_in_minutes: 40
@ -1102,7 +1158,7 @@ steps:
commands: commands:
- nvidia-smi - nvidia-smi
# Run all e2e fusion tests # Run all e2e fusion tests
- pytest -v -s tests/compile/test_fusions_e2e.py - pytest -v -s tests/compile/distributed/test_fusions_e2e.py
- label: ROCm GPT-OSS Eval - label: ROCm GPT-OSS Eval
timeout_in_minutes: 60 timeout_in_minutes: 60
@ -1217,6 +1273,7 @@ steps:
- tests/v1/worker/test_worker_memory_snapshot.py - tests/v1/worker/test_worker_memory_snapshot.py
commands: commands:
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
- pytest -v -s entrypoints/llm/test_collective_rpc.py - pytest -v -s entrypoints/llm/test_collective_rpc.py
@ -1252,7 +1309,7 @@ steps:
- label: Plugin Tests (2 GPUs) # 40min - label: Plugin Tests (2 GPUs) # 40min
timeout_in_minutes: 60 timeout_in_minutes: 60
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_2 agent_pool: mi325_2
# grade: Blocking # grade: Blocking
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
@ -1328,7 +1385,7 @@ steps:
- label: Weight Loading Multiple GPU Test # 33min - label: Weight Loading Multiple GPU Test # 33min
timeout_in_minutes: 45 timeout_in_minutes: 45
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_2 agent_pool: mi325_2
# grade: Blocking # grade: Blocking
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
@ -1428,14 +1485,14 @@ steps:
working_dir: "/vllm-workspace/" working_dir: "/vllm-workspace/"
num_gpus: 2 num_gpus: 2
commands: commands:
- pytest -v -s tests/compile/distributed/test_async_tp.py - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
#- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm #- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
- "pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'" - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
- pytest -v -s tests/compile/distributed/test_sequence_parallel.py - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
- pytest -v -s tests/distributed/test_context_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 - HIP_VISIBLE_DEVICES=0,1 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
- pytest -v -s tests/v1/distributed/test_dbo.py - pytest -v -s tests/v1/distributed/test_dbo.py
##### B200 test ##### ##### B200 test #####
@ -1465,7 +1522,7 @@ steps:
- bash .buildkite/scripts/run-prime-rl-test.sh - bash .buildkite/scripts/run-prime-rl-test.sh
- label: DeepSeek V2-Lite Accuracy - label: DeepSeek V2-Lite Accuracy
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_4 agent_pool: mi325_4
# grade: Blocking # grade: Blocking
timeout_in_minutes: 60 timeout_in_minutes: 60
@ -1476,8 +1533,8 @@ steps:
commands: commands:
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010 - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
- label: Qwen3-30B-A3B-FP8-block Accuracy - label: Qwen3-30B-A3B-FP8-block Accuracy (H100)
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_4 agent_pool: mi325_4
# grade: Blocking # grade: Blocking
timeout_in_minutes: 60 timeout_in_minutes: 60
@ -1487,3 +1544,12 @@ steps:
working_dir: "/vllm-workspace" working_dir: "/vllm-workspace"
commands: commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
timeout_in_minutes: 60
gpu: b200
optional: true
num_gpus: 2
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1

View File

@ -57,14 +57,15 @@ steps:
- pytest -v -s -m 'not cpu_test' multimodal - pytest -v -s -m 'not cpu_test' multimodal
- pytest -v -s utils_ - pytest -v -s utils_
- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 4 mins - label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 15min
timeout_in_minutes: 10 timeout_in_minutes: 20
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/test_inputs.py - tests/test_inputs.py
- tests/test_outputs.py - tests/test_outputs.py
- tests/multimodal - tests/multimodal
- tests/standalone_tests/lazy_imports.py - tests/standalone_tests/lazy_imports.py
- tests/tokenizers_
- tests/transformers_utils - tests/transformers_utils
- tests/config - tests/config
no_gpu: true no_gpu: true
@ -73,6 +74,7 @@ steps:
- pytest -v -s test_inputs.py - pytest -v -s test_inputs.py
- pytest -v -s test_outputs.py - pytest -v -s test_outputs.py
- pytest -v -s -m 'cpu_test' multimodal - pytest -v -s -m 'cpu_test' multimodal
- pytest -v -s tokenizers_
- pytest -v -s transformers_utils - pytest -v -s transformers_utils
- pytest -v -s config - pytest -v -s config
@ -192,6 +194,7 @@ steps:
# test with internal dp # test with internal dp
- python3 ../examples/offline_inference/data_parallel.py --enforce-eager - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
@ -275,21 +278,18 @@ steps:
- pytest -v -s test_regression.py - pytest -v -s test_regression.py
working_dir: "/vllm-workspace/tests" # optional working_dir: "/vllm-workspace/tests" # optional
- label: Engine Test # 25min - label: Engine Test # 9min
timeout_in_minutes: 40 timeout_in_minutes: 15
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/engine - tests/engine
- tests/tokenization
- tests/test_sequence - tests/test_sequence
- tests/test_config - tests/test_config
- tests/test_logger - tests/test_logger
- tests/test_vllm_port - tests/test_vllm_port
commands: commands:
- pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
# OOM in the CI unless we run this separately
- pytest -v -s tokenization
- label: V1 Test e2e + engine # 30min - label: V1 Test e2e + engine # 30min
timeout_in_minutes: 45 timeout_in_minutes: 45
@ -390,20 +390,24 @@ steps:
- examples/ - examples/
commands: commands:
- pip install tensorizer # for tensorizer test - pip install tensorizer # for tensorizer test
# for basic
- python3 offline_inference/basic/chat.py
- python3 offline_inference/basic/generate.py --model facebook/opt-125m - python3 offline_inference/basic/generate.py --model facebook/opt-125m
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
- python3 offline_inference/basic/chat.py
- python3 offline_inference/prefix_caching.py
- python3 offline_inference/llm_engine_example.py
- python3 offline_inference/audio_language.py --seed 0
- python3 offline_inference/vision_language.py --seed 0
- python3 offline_inference/vision_language_pooling.py --seed 0
- python3 offline_inference/vision_language_multi_image.py --seed 0
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
- python3 offline_inference/basic/classify.py - python3 offline_inference/basic/classify.py
- python3 offline_inference/basic/embed.py - python3 offline_inference/basic/embed.py
- python3 offline_inference/basic/score.py - python3 offline_inference/basic/score.py
# for multi-modal models
- python3 offline_inference/audio_language.py --seed 0
- python3 offline_inference/vision_language.py --seed 0
- python3 offline_inference/vision_language_multi_image.py --seed 0
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
# for pooling models
- python3 pooling/pooling/vision_language_pooling.py --seed 0
# for features demo
- python3 offline_inference/prefix_caching.py
- python3 offline_inference/llm_engine_example.py
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
@ -631,6 +635,7 @@ steps:
# we can only upgrade after this is resolved # we can only upgrade after this is resolved
# TODO(jerryzh168): resolve the above comment # TODO(jerryzh168): resolve the above comment
- uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129 - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
- uv pip install --system conch-triton-kernels
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
- label: LM Eval Small Models # 53min - label: LM Eval Small Models # 53min
@ -818,14 +823,24 @@ steps:
commands: commands:
- pytest -v -s models/language/pooling_mteb_test - pytest -v -s models/language/pooling_mteb_test
- label: Multi-Modal Processor Test # 44min - label: Multi-Modal Processor Test (CPU)
timeout_in_minutes: 60
source_file_dependencies:
- vllm/
- tests/models/multimodal
no_gpu: true
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
- label: Multi-Modal Processor Test
timeout_in_minutes: 60 timeout_in_minutes: 60
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/models/multimodal - tests/models/multimodal
commands: commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/processing - pytest -v -s models/multimodal/processing/test_tensor_schema.py
- label: Multi-Modal Models Test (Standard) # 60min - label: Multi-Modal Models Test (Standard) # 60min
timeout_in_minutes: 80 timeout_in_minutes: 80
@ -902,11 +917,12 @@ steps:
- label: Transformers Nightly Models Test - label: Transformers Nightly Models Test
working_dir: "/vllm-workspace/" working_dir: "/vllm-workspace/"
optional: true optional: true
soft_fail: true
commands: commands:
- pip install --upgrade git+https://github.com/huggingface/transformers - pip install --upgrade git+https://github.com/huggingface/transformers
- pytest -v -s tests/models/test_initialization.py -k 'not (Ultravox or Phi4Multimodal or MiniCPMO or Lfm2Moe or RobertaForSequenceClassification or Ovis2_5 or DeepseekOCR or KimiVL)' - pytest -v -s tests/models/test_initialization.py
- pytest -v -s tests/models/test_transformers.py - pytest -v -s tests/models/test_transformers.py
# - pytest -v -s tests/models/multimodal/processing/ - pytest -v -s tests/models/multimodal/processing/
- pytest -v -s tests/models/multimodal/test_mapping.py - pytest -v -s tests/models/multimodal/test_mapping.py
- python3 examples/offline_inference/basic/chat.py - python3 examples/offline_inference/basic/chat.py
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
@ -1116,6 +1132,7 @@ steps:
# https://github.com/NVIDIA/nccl/issues/1838 # https://github.com/NVIDIA/nccl/issues/1838
- export NCCL_CUMEM_HOST_ENABLE=0 - export NCCL_CUMEM_HOST_ENABLE=0
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
- pytest -v -s entrypoints/llm/test_collective_rpc.py - pytest -v -s entrypoints/llm/test_collective_rpc.py
@ -1299,11 +1316,11 @@ steps:
working_dir: "/vllm-workspace/" working_dir: "/vllm-workspace/"
num_gpus: 2 num_gpus: 2
commands: commands:
- pytest -v -s tests/compile/distributed/test_async_tp.py - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
- "pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'" - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
- pytest -v -s tests/distributed/test_sequence_parallel.py - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
- pytest -v -s tests/distributed/test_context_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
- pytest -v -s tests/v1/distributed/test_dbo.py - pytest -v -s tests/v1/distributed/test_dbo.py

3
.github/CODEOWNERS vendored
View File

@ -146,9 +146,10 @@ mkdocs.yaml @hmellor
/requirements/kv_connectors.txt @NickLucche /requirements/kv_connectors.txt @NickLucche
# Pooling models # Pooling models
/examples/*/pooling/ @noooop /examples/pooling @noooop
/tests/models/*/pooling* @noooop /tests/models/*/pooling* @noooop
/tests/entrypoints/pooling @noooop /tests/entrypoints/pooling @noooop
/vllm/entrypoints/pooling @noooop
/vllm/config/pooler.py @noooop /vllm/config/pooler.py @noooop
/vllm/pooling_params.py @noooop /vllm/pooling_params.py @noooop
/vllm/model_executor/layers/pooler.py @noooop /vllm/model_executor/layers/pooler.py @noooop

View File

@ -13,10 +13,10 @@ jobs:
steps: steps:
- name: Checkout repository - name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
- name: Set up Python - name: Set up Python
uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
with: with:
python-version: '3.12' python-version: '3.12'

View File

@ -105,6 +105,31 @@ jobs:
} }
], ],
}, },
cpu: {
// Keyword search - matches whole words only (with word boundaries)
keywords: [
{
term: "CPU Backend",
searchIn: "title"
},
{
term: "x86",
searchIn: "title"
},
{
term: "ARM",
searchIn: "title"
},
{
term: "Apple Silicon",
searchIn: "title"
},
{
term: "IBM Z",
searchIn: "title"
},
],
},
// Add more label configurations here as needed // Add more label configurations here as needed
// example: { // example: {
// keywords: [...], // keywords: [...],

View File

@ -12,7 +12,7 @@ jobs:
timeout-minutes: 30 timeout-minutes: 30
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v6
- uses: astral-sh/setup-uv@v7 - uses: astral-sh/setup-uv@v7
with: with:

View File

@ -16,8 +16,8 @@ jobs:
pre-commit: pre-commit:
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
- uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 - uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
with: with:
python-version: "3.12" python-version: "3.12"
- run: echo "::add-matcher::.github/workflows/matchers/actionlint.json" - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"

View File

@ -136,7 +136,7 @@ elseif(HIP_FOUND)
# ROCm 5.X and 6.X # ROCm 5.X and 6.X
if (ROCM_VERSION_DEV_MAJOR GREATER_EQUAL 5 AND if (ROCM_VERSION_DEV_MAJOR GREATER_EQUAL 5 AND
NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM}) Torch_VERSION VERSION_LESS ${TORCH_SUPPORTED_VERSION_ROCM})
message(WARNING "Pytorch version >= ${TORCH_SUPPORTED_VERSION_ROCM} " message(WARNING "Pytorch version >= ${TORCH_SUPPORTED_VERSION_ROCM} "
"expected for ROCm build, saw ${Torch_VERSION} instead.") "expected for ROCm build, saw ${Torch_VERSION} instead.")
endif() endif()
@ -354,8 +354,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# Only build Marlin kernels if we are building for at least some compatible archs. # Only build Marlin kernels if we are building for at least some compatible archs.
# Keep building Marlin for 9.0 as there are some group sizes and shapes that # Keep building Marlin for 9.0 as there are some group sizes and shapes that
# are not supported by Machete yet. # are not supported by Machete yet.
# 9.0 for latest bf16 atomicAdd PTX
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}") # marlin arches for fp16 output
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0+PTX" "${CUDA_ARCHS}")
# marlin arches for bf16 output (we need 9.0 for bf16 atomicAdd PTX)
cuda_archs_loose_intersection(MARLIN_BF16_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
# marlin arches for fp8 input
# - sm80 doesn't support fp8 computation
# - sm90 and sm100 don't support QMMA.16832.F32.E4M3.E4M3 SAAS instruction
# so we only enable fp8 computation for SM89 (e.g. RTX 40x0) and 12.0 (e.g. RTX 50x0)
cuda_archs_loose_intersection(MARLIN_FP8_ARCHS "8.9;12.0" "${CUDA_ARCHS}")
if (MARLIN_ARCHS) if (MARLIN_ARCHS)
# #
@ -365,16 +374,18 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
set(MARLIN_GEN_SCRIPT set(MARLIN_GEN_SCRIPT
${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/gptq_marlin/generate_kernels.py) ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/gptq_marlin/generate_kernels.py)
file(MD5 ${MARLIN_GEN_SCRIPT} MARLIN_GEN_SCRIPT_HASH) file(MD5 ${MARLIN_GEN_SCRIPT} MARLIN_GEN_SCRIPT_HASH)
list(JOIN CUDA_ARCHS "," CUDA_ARCHS_STR)
set(MARLIN_GEN_SCRIPT_HASH_AND_ARCH "${MARLIN_GEN_SCRIPT_HASH}(ARCH:${CUDA_ARCHS_STR})")
message(STATUS "Marlin generation script hash: ${MARLIN_GEN_SCRIPT_HASH}") message(STATUS "Marlin generation script hash: ${MARLIN_GEN_SCRIPT_HASH_AND_ARCH}")
message(STATUS "Last run Marlin generate script hash: $CACHE{MARLIN_GEN_SCRIPT_HASH}") message(STATUS "Last run Marlin generate script hash: $CACHE{MARLIN_GEN_SCRIPT_HASH_AND_ARCH}")
if (NOT DEFINED CACHE{MARLIN_GEN_SCRIPT_HASH} if (NOT DEFINED CACHE{MARLIN_GEN_SCRIPT_HASH_AND_ARCH}
OR NOT $CACHE{MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MARLIN_GEN_SCRIPT_HASH}) OR NOT $CACHE{MARLIN_GEN_SCRIPT_HASH_AND_ARCH} STREQUAL ${MARLIN_GEN_SCRIPT_HASH_AND_ARCH})
execute_process( execute_process(
COMMAND ${CMAKE_COMMAND} -E env COMMAND ${CMAKE_COMMAND} -E env
PYTHONPATH=$PYTHONPATH PYTHONPATH=$PYTHONPATH
${Python_EXECUTABLE} ${MARLIN_GEN_SCRIPT} ${Python_EXECUTABLE} ${MARLIN_GEN_SCRIPT} ${CUDA_ARCHS_STR}
RESULT_VARIABLE marlin_generation_result RESULT_VARIABLE marlin_generation_result
OUTPUT_VARIABLE marlin_generation_result OUTPUT_VARIABLE marlin_generation_result
OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log
@ -387,15 +398,15 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"\nCheck the log for details: " "\nCheck the log for details: "
"${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log") "${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log")
else() else()
set(MARLIN_GEN_SCRIPT_HASH ${MARLIN_GEN_SCRIPT_HASH} set(MARLIN_GEN_SCRIPT_HASH_AND_ARCH ${MARLIN_GEN_SCRIPT_HASH_AND_ARCH}
CACHE STRING "Last run Marlin generate script hash" FORCE) CACHE STRING "Last run Marlin generate script hash and arch" FORCE)
message(STATUS "Marlin generation completed successfully.") message(STATUS "Marlin generation completed successfully.")
endif() endif()
else() else()
message(STATUS "Marlin generation script has not changed, skipping generation.") message(STATUS "Marlin generation script has not changed, skipping generation.")
endif() endif()
file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/kernel_*.cu") file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_float16.cu")
set_gencode_flags_for_srcs( set_gencode_flags_for_srcs(
SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}" SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
CUDA_ARCHS "${MARLIN_ARCHS}") CUDA_ARCHS "${MARLIN_ARCHS}")
@ -403,12 +414,34 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
set_source_files_properties(${MARLIN_TEMPLATE_KERNEL_SRC} set_source_files_properties(${MARLIN_TEMPLATE_KERNEL_SRC}
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false") PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
endif() endif()
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC}) list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})
file(GLOB MARLIN_TEMPLATE_BF16_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_bfloat16.cu")
set_gencode_flags_for_srcs(
SRCS "${MARLIN_TEMPLATE_BF16_KERNEL_SRC}"
CUDA_ARCHS "${MARLIN_BF16_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
set_source_files_properties(${MARLIN_TEMPLATE_BF16_KERNEL_SRC}
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
endif()
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_BF16_KERNEL_SRC})
if (MARLIN_FP8_ARCHS)
file(GLOB MARLIN_TEMPLATE_FP8_KERNEL_SRC "csrc/quantization/gptq_marlin/sm89_kernel_*.cu")
set_gencode_flags_for_srcs(
SRCS "${MARLIN_TEMPLATE_FP8_KERNEL_SRC}"
CUDA_ARCHS "${MARLIN_FP8_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
set_source_files_properties(${MARLIN_TEMPLATE_FP8_KERNEL_SRC}
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
endif()
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_FP8_KERNEL_SRC})
endif()
set(MARLIN_SRCS set(MARLIN_SRCS
"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu" "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
"csrc/quantization/gptq_marlin/gptq_marlin.cu" "csrc/quantization/gptq_marlin/gptq_marlin.cu"
"csrc/quantization/gptq_marlin/marlin_int4_fp8_preprocess.cu"
"csrc/quantization/gptq_marlin/gptq_marlin_repack.cu" "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
"csrc/quantization/gptq_marlin/awq_marlin_repack.cu") "csrc/quantization/gptq_marlin/awq_marlin_repack.cu")
set_gencode_flags_for_srcs( set_gencode_flags_for_srcs(
@ -604,12 +637,15 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
set(SRCS set(SRCS
"csrc/quantization/fp4/nvfp4_quant_kernels.cu" "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
"csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu" "csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu"
"csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu") "csrc/quantization/fp4/nvfp4_experts_quant.cu"
"csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu"
"csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu")
set_gencode_flags_for_srcs( set_gencode_flags_for_srcs(
SRCS "${SRCS}" SRCS "${SRCS}"
CUDA_ARCHS "${FP4_ARCHS}") CUDA_ARCHS "${FP4_ARCHS}")
list(APPEND VLLM_EXT_SRC "${SRCS}") list(APPEND VLLM_EXT_SRC "${SRCS}")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4_SM120=1") list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4_SM120=1")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM120=1")
message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}") message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}")
else() else()
message(STATUS "Not building NVFP4 as no compatible archs were found.") message(STATUS "Not building NVFP4 as no compatible archs were found.")
@ -938,8 +974,15 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
CUDA_ARCHS "${CUDA_ARCHS}") CUDA_ARCHS "${CUDA_ARCHS}")
list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}") list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
# 9.0 for latest bf16 atomicAdd PTX # moe marlin arches
cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}") # note that we always set `use_atomic_add=False` for moe marlin now,
# so we don't need 9.0 for bf16 atomicAdd PTX
cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0+PTX" "${CUDA_ARCHS}")
# moe marlin arches for fp8 input
# - sm80 doesn't support fp8 computation
# - sm90 and sm100 don't support QMMA.16832.F32.E4M3.E4M3 SAAS instruction
# so we only enable fp8 computation for SM89 (e.g. RTX 40x0) and 12.0 (e.g. RTX 50x0)
cuda_archs_loose_intersection(MARLIN_MOE_FP8_ARCHS "8.9;12.0" "${CUDA_ARCHS}")
if (MARLIN_MOE_ARCHS) if (MARLIN_MOE_ARCHS)
# #
@ -949,16 +992,18 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
set(MOE_MARLIN_GEN_SCRIPT set(MOE_MARLIN_GEN_SCRIPT
${CMAKE_CURRENT_SOURCE_DIR}/csrc/moe/marlin_moe_wna16/generate_kernels.py) ${CMAKE_CURRENT_SOURCE_DIR}/csrc/moe/marlin_moe_wna16/generate_kernels.py)
file(MD5 ${MOE_MARLIN_GEN_SCRIPT} MOE_MARLIN_GEN_SCRIPT_HASH) file(MD5 ${MOE_MARLIN_GEN_SCRIPT} MOE_MARLIN_GEN_SCRIPT_HASH)
list(JOIN CUDA_ARCHS "," CUDA_ARCHS_STR)
set(MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH "${MOE_MARLIN_GEN_SCRIPT_HASH}(ARCH:${CUDA_ARCHS_STR})")
message(STATUS "Marlin MOE generation script hash: ${MOE_MARLIN_GEN_SCRIPT_HASH}") message(STATUS "Marlin MOE generation script hash with arch: ${MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}")
message(STATUS "Last run Marlin MOE generate script hash: $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}") message(STATUS "Last run Marlin MOE generate script hash with arch: $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}")
if (NOT DEFINED CACHE{MOE_MARLIN_GEN_SCRIPT_HASH} if (NOT DEFINED CACHE{MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}
OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH}) OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH})
execute_process( execute_process(
COMMAND ${CMAKE_COMMAND} -E env COMMAND ${CMAKE_COMMAND} -E env
PYTHONPATH=$PYTHONPATH PYTHONPATH=$PYTHONPATH
${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT} ${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT} ${CUDA_ARCHS_STR}
RESULT_VARIABLE moe_marlin_generation_result RESULT_VARIABLE moe_marlin_generation_result
OUTPUT_VARIABLE moe_marlin_generation_output OUTPUT_VARIABLE moe_marlin_generation_output
OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log
@ -971,7 +1016,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"\nCheck the log for details: " "\nCheck the log for details: "
"${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log") "${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log")
else() else()
set(MOE_MARLIN_GEN_SCRIPT_HASH ${MOE_MARLIN_GEN_SCRIPT_HASH} set(MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH ${MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}
CACHE STRING "Last run Marlin MOE generate script hash" FORCE) CACHE STRING "Last run Marlin MOE generate script hash" FORCE)
message(STATUS "Marlin MOE generation completed successfully.") message(STATUS "Marlin MOE generation completed successfully.")
endif() endif()
@ -979,16 +1024,28 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
message(STATUS "Marlin MOE generation script has not changed, skipping generation.") message(STATUS "Marlin MOE generation script has not changed, skipping generation.")
endif() endif()
file(GLOB MOE_WNAA16_MARLIN_SRC "csrc/moe/marlin_moe_wna16/*.cu") file(GLOB MARLIN_MOE_SRC "csrc/moe/marlin_moe_wna16/sm80_kernel_*.cu")
list(APPEND MARLIN_MOE_SRC "csrc/moe/marlin_moe_wna16/ops.cu")
set_gencode_flags_for_srcs( set_gencode_flags_for_srcs(
SRCS "${MOE_WNAA16_MARLIN_SRC}" SRCS "${MARLIN_MOE_SRC}"
CUDA_ARCHS "${MARLIN_MOE_ARCHS}") CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8) if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
set_source_files_properties(${MOE_WNAA16_MARLIN_SRC} set_source_files_properties(${MARLIN_MOE_SRC}
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false") PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
endif() endif()
list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_SRC})
list(APPEND VLLM_MOE_EXT_SRC ${MOE_WNAA16_MARLIN_SRC}) if (MARLIN_MOE_FP8_ARCHS)
file(GLOB MARLIN_MOE_FP8_SRC "csrc/moe/marlin_moe_wna16/sm89_kernel_*.cu")
set_gencode_flags_for_srcs(
SRCS "${MARLIN_MOE_FP8_SRC}"
CUDA_ARCHS "${MARLIN_MOE_FP8_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
set_source_files_properties(${MARLIN_MOE_FP8_SRC}
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
endif()
list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_FP8_SRC})
endif()
message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}") message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}")
else() else()

View File

@ -21,6 +21,7 @@ Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundatio
*Latest News* 🔥 *Latest News* 🔥
- [2025/11] We hosted [vLLM Bangkok Meetup](https://luma.com/v0f647nv). We explored vLLM and LMCache inference and low-resource language adaptation with speakers from Embedded LLM, AMD, and Red Hat. Please find the meetup slides [here](https://drive.google.com/drive/folders/1H0DS57F8HQ5q3kSOSoRmucPJWL3E0A_X?usp=sharing).
- [2025/11] We hosted [the first vLLM Europe Meetup in Zurich](https://luma.com/0gls27kb) focused on quantization, distributed inference, and reinforcement learning at scale with speakers from Mistral, IBM, and Red Hat. Please find the meetup slides [here](https://docs.google.com/presentation/d/1UC9PTLCHYXQpOmJDSFg6Sljra3iVXzc09DeEI7dnxMc/edit?usp=sharing) and recording [here](https://www.youtube.com/watch?v=6m6ZE6yVEDI) - [2025/11] We hosted [the first vLLM Europe Meetup in Zurich](https://luma.com/0gls27kb) focused on quantization, distributed inference, and reinforcement learning at scale with speakers from Mistral, IBM, and Red Hat. Please find the meetup slides [here](https://docs.google.com/presentation/d/1UC9PTLCHYXQpOmJDSFg6Sljra3iVXzc09DeEI7dnxMc/edit?usp=sharing) and recording [here](https://www.youtube.com/watch?v=6m6ZE6yVEDI)
- [2025/11] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/xSrYXjNgr1HbCP4ExYNG1w) focusing on distributed inference and diverse accelerator support with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1nQJ8ZkLSjKxvu36sSHaceVXtttbLvvu-?usp=drive_link). - [2025/11] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/xSrYXjNgr1HbCP4ExYNG1w) focusing on distributed inference and diverse accelerator support with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1nQJ8ZkLSjKxvu36sSHaceVXtttbLvvu-?usp=drive_link).
- [2025/10] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/__xb4OyOsImz-9eAVrdlcg) focused on hands-on vLLM inference optimization! Please find the meetup slides [here](https://drive.google.com/drive/folders/1KqwjsFJLfEsC8wlDugnrR61zsWHt94Q6). - [2025/10] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/__xb4OyOsImz-9eAVrdlcg) focused on hands-on vLLM inference optimization! Please find the meetup slides [here](https://drive.google.com/drive/folders/1KqwjsFJLfEsC8wlDugnrR61zsWHt94Q6).

View File

@ -83,7 +83,7 @@ MIN_CACHE_HIT_PCT=0
MAX_LATENCY_ALLOWED_MS=100000000000 # A very large number MAX_LATENCY_ALLOWED_MS=100000000000 # A very large number
``` ```
#### 2. Maximize Throughput with a Latency Requirement ### 2. Maximize Throughput with a Latency Requirement
- **Goal**: Find the best server parameters when P99 end-to-end latency must be below 500ms. - **Goal**: Find the best server parameters when P99 end-to-end latency must be below 500ms.
- **Configuration**: - **Configuration**:
@ -96,7 +96,7 @@ MIN_CACHE_HIT_PCT=0
MAX_LATENCY_ALLOWED_MS=500 MAX_LATENCY_ALLOWED_MS=500
``` ```
#### 3. Maximize Throughput with Prefix Caching and Latency Requirements ### 3. Maximize Throughput with Prefix Caching and Latency Requirements
- **Goal**: Find the best server parameters assuming a 60% prefix cache hit rate and a latency requirement of 500ms. - **Goal**: Find the best server parameters assuming a 60% prefix cache hit rate and a latency requirement of 500ms.
- **Configuration**: - **Configuration**:

View File

@ -620,7 +620,7 @@ def get_tokenizer(
kwargs["use_fast"] = False kwargs["use_fast"] = False
if tokenizer_mode == "mistral": if tokenizer_mode == "mistral":
try: try:
from vllm.transformers_utils.tokenizer import MistralTokenizer from vllm.tokenizers import MistralTokenizer
except ImportError as e: except ImportError as e:
raise ImportError( raise ImportError(
"MistralTokenizer requires vllm package.\n" "MistralTokenizer requires vllm package.\n"

View File

@ -108,7 +108,10 @@ def benchmark_batched_propose(args):
device_config=DeviceConfig(device=current_platform.device_type), device_config=DeviceConfig(device=current_platform.device_type),
parallel_config=ParallelConfig(), parallel_config=ParallelConfig(),
load_config=LoadConfig(), load_config=LoadConfig(),
scheduler_config=SchedulerConfig(), scheduler_config=SchedulerConfig(
max_model_len=model_config.max_model_len,
is_encoder_decoder=model_config.is_encoder_decoder,
),
) )
# monkey patch vllm.v1.worker.gpu_model_runner.get_pp_group # monkey patch vllm.v1.worker.gpu_model_runner.get_pp_group

View File

@ -40,7 +40,7 @@ from vllm.engine.arg_utils import EngineArgs
from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.utils.argparse_utils import FlexibleArgumentParser
try: try:
from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.tokenizers import get_tokenizer
except ImportError: except ImportError:
from backend_request_func import get_tokenizer from backend_request_func import get_tokenizer

View File

@ -46,7 +46,7 @@ from tqdm.asyncio import tqdm
from transformers import PreTrainedTokenizerBase from transformers import PreTrainedTokenizerBase
try: try:
from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.tokenizers import get_tokenizer
except ImportError: except ImportError:
from backend_request_func import get_tokenizer from backend_request_func import get_tokenizer

View File

@ -237,6 +237,7 @@ def marlin_create_bench_fn(bt: BenchmarkTensors) -> Callable:
b_q_weight=w_q, b_q_weight=w_q,
b_bias=None, b_bias=None,
b_scales=w_s, b_scales=w_s,
a_scales=None,
global_scale=None, global_scale=None,
b_zeros=w_zp, b_zeros=w_zp,
g_idx=g_idx, g_idx=g_idx,

View File

@ -263,7 +263,7 @@ def bench_run(
results.append( results.append(
benchmark.Timer( benchmark.Timer(
stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)", # noqa: E501 stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, None, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)", # noqa: E501
globals=globals, globals=globals,
label=label, label=label,
sub_label=sub_label, sub_label=sub_label,
@ -273,7 +273,7 @@ def bench_run(
results.append( results.append(
benchmark.Timer( benchmark.Timer(
stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)", # noqa: E501 stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, None, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)", # noqa: E501
globals=globals, globals=globals,
label=label, label=label,
sub_label=sub_label, sub_label=sub_label,

View File

@ -495,7 +495,13 @@ function (define_extension_target MOD_NAME)
set(SOABI_KEYWORD "") set(SOABI_KEYWORD "")
endif() endif()
if (ARG_USE_SABI) run_python(IS_FREETHREADED_PYTHON
"import sysconfig; print(1 if sysconfig.get_config_var(\"Py_GIL_DISABLED\") else 0)"
"Failed to determine whether interpreter is free-threaded")
# Free-threaded Python doesn't yet support the stable ABI (see PEP 803/809),
# so avoid using the stable ABI under free-threading only.
if (ARG_USE_SABI AND NOT IS_FREETHREADED_PYTHON)
Python_add_library(${MOD_NAME} MODULE USE_SABI ${ARG_USE_SABI} ${SOABI_KEYWORD} "${ARG_SOURCES}") Python_add_library(${MOD_NAME} MODULE USE_SABI ${ARG_USE_SABI} ${SOABI_KEYWORD} "${ARG_SOURCES}")
else() else()
Python_add_library(${MOD_NAME} MODULE ${SOABI_KEYWORD} "${ARG_SOURCES}") Python_add_library(${MOD_NAME} MODULE ${SOABI_KEYWORD} "${ARG_SOURCES}")

View File

@ -16,7 +16,8 @@ __global__ void merge_attn_states_kernel(
scalar_t* output, float* output_lse, const scalar_t* prefix_output, scalar_t* output, float* output_lse, const scalar_t* prefix_output,
const float* prefix_lse, const scalar_t* suffix_output, const float* prefix_lse, const scalar_t* suffix_output,
const float* suffix_lse, const uint num_tokens, const uint num_heads, const float* suffix_lse, const uint num_tokens, const uint num_heads,
const uint head_size) { const uint head_size, const uint prefix_head_stride,
const uint output_head_stride) {
using pack_128b_t = uint4; using pack_128b_t = uint4;
const uint pack_size = 16 / sizeof(scalar_t); const uint pack_size = 16 / sizeof(scalar_t);
const uint threads_per_head = head_size / pack_size; const uint threads_per_head = head_size / pack_size;
@ -34,11 +35,13 @@ __global__ void merge_attn_states_kernel(
const uint head_idx = token_head_idx % num_heads; const uint head_idx = token_head_idx % num_heads;
const uint pack_offset = pack_idx * pack_size; // (0~15)*8, etc. const uint pack_offset = pack_idx * pack_size; // (0~15)*8, etc.
const uint head_offset = const uint src_head_offset = token_idx * num_heads * prefix_head_stride +
token_idx * num_heads * head_size + head_idx * head_size; head_idx * prefix_head_stride;
const scalar_t* prefix_head_ptr = prefix_output + head_offset; const uint dst_head_offset = token_idx * num_heads * output_head_stride +
const scalar_t* suffix_head_ptr = suffix_output + head_offset; head_idx * output_head_stride;
scalar_t* output_head_ptr = output + head_offset; const scalar_t* prefix_head_ptr = prefix_output + src_head_offset;
const scalar_t* suffix_head_ptr = suffix_output + src_head_offset;
scalar_t* output_head_ptr = output + dst_head_offset;
float p_lse = prefix_lse[head_idx * num_tokens + token_idx]; float p_lse = prefix_lse[head_idx * num_tokens + token_idx];
float s_lse = suffix_lse[head_idx * num_tokens + token_idx]; float s_lse = suffix_lse[head_idx * num_tokens + token_idx];
@ -140,7 +143,7 @@ __global__ void merge_attn_states_kernel(
reinterpret_cast<float*>(prefix_lse.data_ptr()), \ reinterpret_cast<float*>(prefix_lse.data_ptr()), \
reinterpret_cast<scalar_t*>(suffix_output.data_ptr()), \ reinterpret_cast<scalar_t*>(suffix_output.data_ptr()), \
reinterpret_cast<float*>(suffix_lse.data_ptr()), num_tokens, \ reinterpret_cast<float*>(suffix_lse.data_ptr()), num_tokens, \
num_heads, head_size); \ num_heads, head_size, prefix_head_stride, output_head_stride); \
} }
/*@brief Merges the attention states from prefix and suffix /*@brief Merges the attention states from prefix and suffix
@ -166,17 +169,11 @@ void merge_attn_states_launcher(torch::Tensor& output,
const uint num_tokens = output.size(0); const uint num_tokens = output.size(0);
const uint num_heads = output.size(1); const uint num_heads = output.size(1);
const uint head_size = output.size(2); const uint head_size = output.size(2);
const uint prefix_head_stride = prefix_output.stride(1);
const uint output_head_stride = output.stride(1);
const uint pack_size = 16 / sizeof(scalar_t); const uint pack_size = 16 / sizeof(scalar_t);
TORCH_CHECK(head_size % pack_size == 0, TORCH_CHECK(head_size % pack_size == 0,
"headsize must be multiple of pack_size:", pack_size); "headsize must be multiple of pack_size:", pack_size);
TORCH_CHECK(output.stride(-2) == head_size && output.stride(-1) == 1,
"output heads must be contiguous in memory");
TORCH_CHECK(
prefix_output.stride(-2) == head_size && prefix_output.stride(-1) == 1,
"prefix_output heads must be contiguous in memory");
TORCH_CHECK(
suffix_output.stride(-2) == head_size && suffix_output.stride(-1) == 1,
"suffix_output heads must be contiguous in memory");
float* output_lse_ptr = nullptr; float* output_lse_ptr = nullptr;
if (output_lse.has_value()) { if (output_lse.has_value()) {
output_lse_ptr = output_lse.value().data_ptr<float>(); output_lse_ptr = output_lse.value().data_ptr<float>();

View File

@ -51,12 +51,13 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
if (node_id != -1) { if (node_id != -1) {
node_ids.insert(node_id); node_ids.insert(node_id);
} }
TORCH_WARN(node_id == mem_node_id, "CPU ", cpu_id, " is on NUMA node ", if (node_id != mem_node_id) {
node_id, ", but CPU ", omp_cpu_ids.front(), TORCH_WARN("CPU ", cpu_id, " is on NUMA node ", node_id, ", but CPU ",
" is on NUMA node ", mem_node_id, omp_cpu_ids.front(), " is on NUMA node ", mem_node_id,
". All CPUs should be on the same NUMA node for optimal " ". All CPUs should be on the same NUMA node for optimal "
"performance. Memory will be bound to NUMA node ", "performance. Memory will be bound to NUMA node ",
mem_node_id, "."); mem_node_id, ".");
}
} }
// Concatenate all node_ids into a single comma-separated string // Concatenate all node_ids into a single comma-separated string
if (!node_ids.empty()) { if (!node_ids.empty()) {

View File

@ -93,16 +93,16 @@ torch::Tensor dynamic_4bit_int_moe_cpu(
} }
auto Y_all = at::empty({offsets[E], H}, x_c.options()); auto Y_all = at::empty({offsets[E], H}, x_c.options());
at::parallel_for(0, E, 1, [&](int64_t e_begin, int64_t e_end) { at::parallel_for(0, offsets[E], 0, [&](int64_t idx_begin, int64_t idx_end) {
c10::InferenceMode guard; c10::InferenceMode guard;
for (int64_t e = e_begin; e < e_end; ++e) { for (int64_t e = 0; e < E; ++e) {
const int64_t te = counts[e]; int64_t start = std::max(offsets[e], idx_begin);
if (te == 0) { int64_t end = std::min(offsets[e + 1], idx_end);
int64_t te = end - start;
if (te <= 0) {
continue; continue;
} }
const int64_t start = offsets[e];
auto x_e = X_all.narrow(/*dim=*/0, /*start=*/start, /*length=*/te); auto x_e = X_all.narrow(/*dim=*/0, /*start=*/start, /*length=*/te);
auto w13_e = w13_packed.select(/*dim=*/0, e); auto w13_e = w13_packed.select(/*dim=*/0, e);

View File

@ -1 +1,2 @@
kernel_*.cu sm*_kernel_*.cu
kernel_selector.h

View File

@ -4,134 +4,282 @@ import glob
import itertools import itertools
import os import os
import subprocess import subprocess
import sys
import jinja2 import jinja2
FILE_HEAD = """ ARCHS = []
// auto generated by generate.py SUPPORT_FP8 = False
// clang-format off for arch in sys.argv[1].split(","):
arch = arch[: arch.index(".") + 2].replace(".", "")
arch = int(arch)
# only SM89 and SM120 fully support
# mma.sync.aligned.m16n8k32.row.col.f32.e4m3.e4m3.f32.
# SM90 and SM100 can use this PTX, but its simulated
# with FP16 MMA, so it cannot achieve any acceleration.
if arch in [89, 120]:
SUPPORT_FP8 = True
FILE_HEAD_COMMENT = """
// auto generated by generate_kernels.py
// clang-format off
""".lstrip()
FILE_HEAD = (
FILE_HEAD_COMMENT
+ """
#include "kernel.h" #include "kernel.h"
#include "marlin_template.h" #include "marlin_template.h"
namespace MARLIN_NAMESPACE_NAME { namespace MARLIN_NAMESPACE_NAME {
""".strip() """
)
TEMPLATE = ( TEMPLATE = (
"template __global__ void Marlin<" "template __global__ void Marlin<"
"{{scalar_t}}, " "{{a_type_id}}, "
"{{w_type_id}}, " "{{b_type_id}}, "
"{{c_type_id}}, "
"{{s_type_id}}, " "{{s_type_id}}, "
"{{threads}}, " "{{threads}}, "
"{{thread_m_blocks}}, " "{{thread_m_blocks}}, "
"{{thread_n_blocks}}, " "{{thread_n_blocks}}, "
"{{thread_k_blocks}}, " "{{thread_k_blocks}}, "
"{{'true' if m_block_size_8 else 'false'}}, " "{{m_block_size_8}}, "
"{{stages}}, " "{{stages}}, "
"{{group_blocks}}, " "{{group_blocks}}, "
"{{'true' if is_zp_float else 'false'}}>" "{{is_zp_float}}>"
"( MARLIN_KERNEL_PARAMS );" "( MARLIN_KERNEL_PARAMS );"
) )
# int8 with zero point case (vllm::kU8) is also supported,
# we don't add it to reduce wheel size.
SCALAR_TYPES = [
"vllm::kU4",
"vllm::kU4B8",
"vllm::kU8B128",
"vllm::kFE4M3fn",
"vllm::kFE2M1f",
]
THREAD_CONFIGS = [(128, 128, 256), (64, 256, 256), (64, 128, 128)] THREAD_CONFIGS = [(128, 128, 256), (64, 256, 256), (64, 128, 128)]
THREAD_M_BLOCKS = [0.5, 1, 2, 3, 4] THREAD_M_BLOCKS = [0.5, 1, 2, 3, 4]
# group_blocks:
# = 0 : act order case QUANT_CONFIGS = [
# = -1 : channelwise quantization # AWQ-INT4
# > 0 : group_size=16*group_blocks {
GROUP_BLOCKS = [0, -1, 1, 2, 4, 8] "b_type": "kU4",
DTYPES = ["fp16", "bf16"] "thread_configs": THREAD_CONFIGS,
"thread_m_blocks": THREAD_M_BLOCKS,
"group_blocks": [-1, 2, 4, 8],
},
# GPTQ-INT4
{
"b_type": "kU4B8",
"thread_configs": THREAD_CONFIGS,
"thread_m_blocks": THREAD_M_BLOCKS,
"group_blocks": [-1, 0, 2, 4, 8],
},
# AWQ-INT8
{
"b_type": "kU8B128",
"thread_configs": THREAD_CONFIGS,
"thread_m_blocks": THREAD_M_BLOCKS,
"group_blocks": [-1, 0, 2, 4, 8],
},
# FP8
{
"b_type": "kFE4M3fn",
"thread_configs": THREAD_CONFIGS,
"thread_m_blocks": THREAD_M_BLOCKS,
"group_blocks": [-1, 8],
},
# NVFP4
{
"b_type": "kFE2M1f",
"s_type": "kFE4M3fn",
"thread_configs": THREAD_CONFIGS,
"thread_m_blocks": THREAD_M_BLOCKS,
"group_blocks": [1],
},
# MXFP4
{
"a_type": ["kBFloat16"],
"b_type": "kFE2M1f",
"s_type": "kFE8M0fnu",
"thread_configs": THREAD_CONFIGS,
"thread_m_blocks": THREAD_M_BLOCKS,
"group_blocks": [2],
},
# AWQ-INT4 with INT8 activation
{
"a_type": ["kS8"],
"b_type": "kU4",
"thread_configs": THREAD_CONFIGS,
"thread_m_blocks": [1, 2, 3, 4],
"group_blocks": [-1, 2, 4, 8],
},
# GPTQ-INT4 with INT8 activation
{
"a_type": ["kS8"],
"b_type": "kU4B8",
"thread_configs": THREAD_CONFIGS,
"thread_m_blocks": [1, 2, 3, 4],
"group_blocks": [-1, 2, 4, 8],
},
# GPTQ-INT4 with FP8 activation
{
"a_type": ["kFE4M3fn"],
"b_type": "kU4B8",
"thread_configs": THREAD_CONFIGS,
"thread_m_blocks": [1, 2, 3, 4],
"group_blocks": [-1, 2, 4, 8],
},
# AWQ-INT4 with FP8 activation
{
"a_type": ["kFE4M3fn"],
"b_type": "kU4",
"thread_configs": THREAD_CONFIGS,
"thread_m_blocks": [1, 2, 3, 4],
"group_blocks": [-1, 2, 4, 8],
},
# MXFP4 with FP8 activation
{
"a_type": ["kFE4M3fn"],
"b_type": "kFE2M1f",
"c_type": ["kBFloat16"],
"s_type": "kFE8M0fnu",
"thread_configs": THREAD_CONFIGS,
"thread_m_blocks": [1, 2, 3, 4],
"group_blocks": [2],
},
]
def remove_old_kernels(): def remove_old_kernels():
for filename in glob.glob(os.path.dirname(__file__) + "/kernel_*.cu"): for filename in glob.glob(os.path.dirname(__file__) + "/*kernel_*.cu"):
subprocess.call(["rm", "-f", filename]) subprocess.call(["rm", "-f", filename])
filename = os.path.dirname(__file__) + "/kernel_selector.h"
subprocess.call(["rm", "-f", filename])
def generate_new_kernels(): def generate_new_kernels():
for scalar_type, dtype in itertools.product(SCALAR_TYPES, DTYPES): result_dict = {}
for quant_config in QUANT_CONFIGS:
c_types = quant_config.get("c_type", ["kFloat16", "kBFloat16"])
a_types = quant_config.get("a_type", ["kFloat16", "kBFloat16"])
b_type = quant_config["b_type"]
all_group_blocks = quant_config["group_blocks"]
all_m_blocks = quant_config["thread_m_blocks"]
all_thread_configs = quant_config["thread_configs"]
for a_type, c_type in itertools.product(a_types, c_types):
if not SUPPORT_FP8 and a_type == "kFE4M3fn":
continue
if "16" in a_type and "16" in c_type and a_type != c_type:
continue
s_type = quant_config.get("s_type", c_type)
if (a_type, b_type, c_type) not in result_dict:
result_dict[(a_type, b_type, c_type)] = []
for group_blocks, m_blocks, thread_configs in itertools.product(
all_group_blocks, all_m_blocks, all_thread_configs
):
thread_k, thread_n, threads = thread_configs
if threads == 256:
# for small batch (m_blocks == 1),
# we only need (128, 128, 256)
# for large batch (m_blocks > 1),
# we only need (64, 256, 256)
if m_blocks <= 1 and (thread_k, thread_n) != (128, 128):
continue
if m_blocks > 1 and (thread_k, thread_n) != (64, 256):
continue
config = {
"threads": threads,
"s_type": s_type,
"thread_m_blocks": max(m_blocks, 1),
"thread_k_blocks": thread_k // 16,
"thread_n_blocks": thread_n // 16,
"m_block_size_8": "true" if m_blocks == 0.5 else "false",
"stages": "pipe_stages",
"group_blocks": group_blocks,
"is_zp_float": "false",
}
result_dict[(a_type, b_type, c_type)].append(config)
kernel_selector_str = FILE_HEAD_COMMENT
for (a_type, b_type, c_type), config_list in result_dict.items():
all_template_str_list = [] all_template_str_list = []
for config in config_list:
for group_blocks, m_blocks, thread_configs in itertools.product( s_type = config["s_type"]
GROUP_BLOCKS, THREAD_M_BLOCKS, THREAD_CONFIGS
):
# act order case only support gptq-int4 and gptq-int8
if group_blocks == 0 and scalar_type not in [
"vllm::kU4B8",
"vllm::kU8B128",
]:
continue
if thread_configs[2] == 256:
# for small batch (m_blocks == 1), we only need (128, 128, 256)
# for large batch (m_blocks > 1), we only need (64, 256, 256)
if m_blocks <= 1 and thread_configs[0] != 128:
continue
if m_blocks > 1 and thread_configs[0] != 64:
continue
# we only support channelwise quantization and group_size == 128
# for fp8
if scalar_type == "vllm::kFE4M3fn" and group_blocks not in [-1, 8]:
continue
# nvfp4 only supports group_size == 16
# mxfp4 only supports group_size == 32
if scalar_type == "vllm::kFE2M1f" and group_blocks not in [1, 2]:
continue
# other quantization methods don't support group_size = 16
if scalar_type != "vllm::kFE2M1f" and group_blocks == 1:
continue
k_blocks = thread_configs[0] // 16
n_blocks = thread_configs[1] // 16
threads = thread_configs[2]
c_dtype = "half" if dtype == "fp16" else "nv_bfloat16"
if scalar_type == "vllm::kFE2M1f" and group_blocks == 1:
s_type = "vllm::kFE4M3fn"
elif scalar_type == "vllm::kFE2M1f" and group_blocks == 2:
s_type = "vllm::kFE8M0fnu"
if dtype == "fp16":
# we cannot safely dequantize e8m0 to fp16, so skip this
continue
elif dtype == "fp16":
s_type = "vllm::kFloat16"
elif dtype == "bf16":
s_type = "vllm::kBFloat16"
template_str = jinja2.Template(TEMPLATE).render( template_str = jinja2.Template(TEMPLATE).render(
scalar_t=c_dtype, a_type_id=f"vllm::{a_type}.id()",
w_type_id=scalar_type + ".id()", b_type_id=f"vllm::{b_type}.id()",
s_type_id=s_type + ".id()", c_type_id=f"vllm::{c_type}.id()",
threads=threads, s_type_id=f"vllm::{s_type}.id()",
thread_m_blocks=max(m_blocks, 1), **config,
thread_n_blocks=n_blocks, )
thread_k_blocks=k_blocks, all_template_str_list.append(template_str)
m_block_size_8=m_blocks == 0.5,
stages="pipe_stages", conditions = [
group_blocks=group_blocks, f"a_type == vllm::{a_type}",
is_zp_float=False, f"b_type == vllm::{b_type}",
f"c_type == vllm::{c_type}",
f"s_type == vllm::{s_type}",
f"threads == {config['threads']}",
f"thread_m_blocks == {config['thread_m_blocks']}",
f"thread_n_blocks == {config['thread_n_blocks']}",
f"thread_k_blocks == {config['thread_k_blocks']}",
f"m_block_size_8 == {config['m_block_size_8']}",
f"group_blocks == {config['group_blocks']}",
f"is_zp_float == {config['is_zp_float']}",
]
conditions = " && ".join(conditions)
if kernel_selector_str == FILE_HEAD_COMMENT:
kernel_selector_str += f"if ({conditions})\n kernel = "
else:
kernel_selector_str += f"else if ({conditions})\n kernel = "
kernel_template2 = (
"Marlin<{{a_type_id}}, {{b_type_id}}, {{c_type_id}}, "
"{{s_type_id}}, {{threads}}, {{thread_m_blocks}}, "
"{{thread_n_blocks}}, {{thread_k_blocks}}, "
"{{m_block_size_8}}, {{stages}}, {{group_blocks}}, "
"{{is_zp_float}}>;"
) )
all_template_str_list.append(template_str) kernel_selector_str += (
jinja2.Template(kernel_template2).render(
a_type_id=f"vllm::{a_type}.id()",
b_type_id=f"vllm::{b_type}.id()",
c_type_id=f"vllm::{c_type}.id()",
s_type_id=f"vllm::{s_type}.id()",
**config,
)
+ "\n"
)
file_content = FILE_HEAD + "\n\n" file_content = FILE_HEAD + "\n\n"
file_content += "\n\n".join(all_template_str_list) + "\n\n}\n" file_content += "\n\n".join(all_template_str_list) + "\n\n}\n"
filename = f"kernel_{dtype}_{scalar_type[6:].lower()}.cu" if a_type == "kFE4M3fn":
filename = f"sm89_kernel_{a_type[1:]}_{b_type[1:]}_{c_type[1:]}.cu"
else:
filename = f"sm80_kernel_{a_type[1:]}_{b_type[1:]}_{c_type[1:]}.cu"
filename = filename.lower()
with open(os.path.join(os.path.dirname(__file__), filename), "w") as f: with open(os.path.join(os.path.dirname(__file__), filename), "w") as f:
f.write(file_content) f.write(file_content)
if not SUPPORT_FP8 and kernel_selector_str != FILE_HEAD_COMMENT:
kernel_selector_str += (
"else if (a_type == vllm::kFE4M3fn)\n"
" TORCH_CHECK(false, "
'"marlin kernel with fp8 activation is not built.");'
)
with open(os.path.join(os.path.dirname(__file__), "kernel_selector.h"), "w") as f:
f.write(kernel_selector_str)
if __name__ == "__main__": if __name__ == "__main__":
remove_old_kernels() remove_old_kernels()

View File

@ -11,8 +11,9 @@
const int4 *__restrict__ A, const int4 *__restrict__ B, \ const int4 *__restrict__ A, const int4 *__restrict__ B, \
int4 *__restrict__ C, int4 *__restrict__ C_tmp, \ int4 *__restrict__ C, int4 *__restrict__ C_tmp, \
const int4 *__restrict__ b_bias_ptr, \ const int4 *__restrict__ b_bias_ptr, \
const float *__restrict__ a_scales_ptr, \
const int4 *__restrict__ scales_ptr, \ const int4 *__restrict__ scales_ptr, \
const uint16_t *__restrict__ scale2_ptr, \ const uint16_t *__restrict__ global_scale_ptr, \
const int4 *__restrict__ zp_ptr, const int *__restrict__ g_idx, \ const int4 *__restrict__ zp_ptr, const int *__restrict__ g_idx, \
const int32_t *__restrict__ sorted_token_ids_ptr, \ const int32_t *__restrict__ sorted_token_ids_ptr, \
const int32_t *__restrict__ expert_ids_ptr, \ const int32_t *__restrict__ expert_ids_ptr, \
@ -20,12 +21,13 @@
const float *__restrict__ topk_weights_ptr, int top_k, \ const float *__restrict__ topk_weights_ptr, int top_k, \
bool mul_topk_weights, bool is_ep, int num_groups, int prob_m, \ bool mul_topk_weights, bool is_ep, int num_groups, int prob_m, \
int prob_n, int prob_k, int *locks, bool has_bias, bool use_atomic_add, \ int prob_n, int prob_k, int *locks, bool has_bias, bool use_atomic_add, \
bool use_fp32_reduce, int max_shared_mem bool use_fp32_reduce
namespace MARLIN_NAMESPACE_NAME { namespace MARLIN_NAMESPACE_NAME {
template <typename scalar_t, // compute dtype, half or nv_float16 template <const vllm::ScalarTypeId a_type_id, // A ScalarType id
const vllm::ScalarTypeId w_type_id, // weight ScalarType id const vllm::ScalarTypeId b_type_id, // B ScalarType id
const vllm::ScalarTypeId s_type_id, // weight scale ScalarType id const vllm::ScalarTypeId c_type_id, // C ScalarType id
const vllm::ScalarTypeId s_type_id, // B_SCALE ScalarType id
const int threads, // number of threads in a threadblock const int threads, // number of threads in a threadblock
const int thread_m_blocks, // number of 16x16 blocks in the m const int thread_m_blocks, // number of 16x16 blocks in the m
// dimension (batchsize) of the // dimension (batchsize) of the

File diff suppressed because it is too large Load Diff

View File

@ -37,39 +37,6 @@ __global__ void MarlinDefault(MARLIN_KERNEL_PARAMS){};
using MarlinFuncPtr = void (*)(MARLIN_KERNEL_PARAMS); using MarlinFuncPtr = void (*)(MARLIN_KERNEL_PARAMS);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
template <int moe_block_size>
__global__ void permute_cols_kernel(
int4 const* __restrict__ a_int4_ptr, int const* __restrict__ perm_int_ptr,
int4* __restrict__ out_int4_ptr,
const int32_t* __restrict__ sorted_token_ids_ptr,
const int32_t* __restrict__ expert_ids_ptr,
const int32_t* __restrict__ num_tokens_past_padded_ptr, int size_m,
int size_k, int top_k) {};
} // namespace marlin
torch::Tensor moe_wna16_marlin_gemm(
torch::Tensor& a, std::optional<torch::Tensor> c_or_none,
torch::Tensor& b_q_weight,
std::optional<torch::Tensor> const& b_bias_or_none, torch::Tensor& b_scales,
std::optional<torch::Tensor> const& b_zeros_or_none,
std::optional<torch::Tensor> const& g_idx_or_none,
std::optional<torch::Tensor> const& perm_or_none, torch::Tensor& workspace,
torch::Tensor& sorted_token_ids, torch::Tensor& expert_ids,
torch::Tensor& num_tokens_past_padded, torch::Tensor& topk_weights,
int64_t moe_block_size, int64_t top_k, bool mul_topk_weights, bool is_ep,
vllm::ScalarTypeId const& b_q_type_id, int64_t size_m, int64_t size_n,
int64_t size_k, bool is_k_full, bool use_atomic_add, bool use_fp32_reduce,
bool is_zp_float) {
TORCH_CHECK_NOT_IMPLEMENTED(false,
"marlin_gemm(..) requires CUDA_ARCH >= 8.0");
return torch::empty({1, 1});
}
#else
// For a given "a" of size [M,K] performs a permutation of the K columns based // For a given "a" of size [M,K] performs a permutation of the K columns based
// on the given "perm" indices. // on the given "perm" indices.
template <int moe_block_size> template <int moe_block_size>
@ -207,7 +174,7 @@ int get_kernel_cache_size(thread_config_t const& th_config, bool m_block_size_8,
int thread_m_blocks, int prob_m, int prob_n, int thread_m_blocks, int prob_m, int prob_n,
int prob_k, int num_bits, int group_size, int prob_k, int num_bits, int group_size,
bool has_act_order, bool is_k_full, int has_zp, bool has_act_order, bool is_k_full, int has_zp,
int is_zp_float) { int is_zp_float, bool is_a_8bit) {
int pack_factor = 32 / num_bits; int pack_factor = 32 / num_bits;
// Get B size // Get B size
@ -217,8 +184,8 @@ int get_kernel_cache_size(thread_config_t const& th_config, bool m_block_size_8,
// shm size for block_sorted_ids/rd_block_sorted_ids/block_topk_weights // shm size for block_sorted_ids/rd_block_sorted_ids/block_topk_weights
// both of them requires tb_m * 4 bytes (tb_m * int32 or tb_m * float32) // both of them requires tb_m * 4 bytes (tb_m * int32 or tb_m * float32)
int sh_block_meta_size = tb_m * 4; int sh_block_meta_size = tb_m * 16;
int sh_a_size = pipe_stages * (tb_m * tb_k) * 2; int sh_a_size = pipe_stages * (tb_m * tb_k) * (is_a_8bit ? 1 : 2);
int sh_b_size = pipe_stages * (tb_k * tb_n / pack_factor) * 4; int sh_b_size = pipe_stages * (tb_k * tb_n / pack_factor) * 4;
int sh_red_size = tb_m * (tb_n + 8) * 2; int sh_red_size = tb_m * (tb_n + 8) * 2;
int sh_bias_size = tb_n * 2; int sh_bias_size = tb_n * 2;
@ -250,7 +217,7 @@ bool is_valid_config(thread_config_t const& th_config, bool m_block_size_8,
int thread_m_blocks, int prob_m, int prob_n, int prob_k, int thread_m_blocks, int prob_m, int prob_n, int prob_k,
int num_bits, int group_size, bool has_act_order, int num_bits, int group_size, bool has_act_order,
bool is_k_full, int has_zp, int is_zp_float, bool is_k_full, int has_zp, int is_zp_float,
int max_shared_mem) { int max_shared_mem, bool is_a_8bit) {
// Sanity // Sanity
if (th_config.thread_k == -1 || th_config.thread_n == -1 || if (th_config.thread_k == -1 || th_config.thread_n == -1 ||
th_config.num_threads == -1) { th_config.num_threads == -1) {
@ -273,188 +240,34 @@ bool is_valid_config(thread_config_t const& th_config, bool m_block_size_8,
} }
// Check that pipeline fits into cache // Check that pipeline fits into cache
int cache_size = get_kernel_cache_size( int cache_size =
th_config, m_block_size_8, thread_m_blocks, prob_m, prob_n, prob_k, get_kernel_cache_size(th_config, m_block_size_8, thread_m_blocks, prob_m,
num_bits, group_size, has_act_order, is_k_full, has_zp, is_zp_float); prob_n, prob_k, num_bits, group_size, has_act_order,
return cache_size + 512 <= max_shared_mem; is_k_full, has_zp, is_zp_float, is_a_8bit);
return cache_size <= max_shared_mem;
} }
#define _GET_IF(W_TYPE, THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS, \ MarlinFuncPtr get_marlin_kernel(
M_BLOCK_SIZE_8, GROUP_BLOCKS, NUM_THREADS, IS_ZP_FLOAT) \ const vllm::ScalarType a_type, const vllm::ScalarType b_type,
else if (q_type == W_TYPE && thread_m_blocks == THREAD_M_BLOCKS && \ const vllm::ScalarType c_type, const vllm::ScalarType s_type,
thread_n_blocks == THREAD_N_BLOCKS && \ int thread_m_blocks, int thread_n_blocks, int thread_k_blocks,
thread_k_blocks == THREAD_K_BLOCKS && \ bool m_block_size_8, bool has_act_order, bool has_zp, int group_blocks,
m_block_size_8 == M_BLOCK_SIZE_8 && \ int threads, bool is_zp_float) {
group_blocks == GROUP_BLOCKS && num_threads == NUM_THREADS && \ int num_bits = b_type.size_bits();
is_zp_float == IS_ZP_FLOAT) { \
constexpr auto S_TYPE = \
W_TYPE == vllm::kFE2M1f \
? (GROUP_BLOCKS == 1 ? vllm::kFE4M3fn : vllm::kFE8M0fnu) \
: (std::is_same<scalar_t, half>::value ? vllm::kFloat16 \
: vllm::kBFloat16); \
kernel = Marlin<scalar_t, W_TYPE.id(), S_TYPE.id(), NUM_THREADS, \
THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS, \
M_BLOCK_SIZE_8, pipe_stages, GROUP_BLOCKS, IS_ZP_FLOAT>; \
}
// COMMON: cases for (group_blocks in [-1, 2, 4, 8] and is_zp_float == false)
// this is the most common cases
// BIGGROUP: cases for big group size (group_blocks in [-1, 8])
// FZP: cases for float-zero-point (is_zp_float = true)
// ACT: cases for act order case (group_blocks == 0)
// FP4: cases for nvfp4(e2m1) (group_blocks == 1)
#define COMMON_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, -1, NUM_THREADS, false) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 2, NUM_THREADS, false) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 4, NUM_THREADS, false) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 8, NUM_THREADS, false) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS, false) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false)
#define COMMON_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
_GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \
_GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false) \
_GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS, false) \
_GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false) \
\
_GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \
_GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false) \
_GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS, false) \
_GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false) \
\
_GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \
_GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false) \
_GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS, false) \
_GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false)
#define COMMON_GET_IF(W_TYPE) \
COMMON_GET_IF_M1(W_TYPE, 8, 8, 256) \
COMMON_GET_IF_M1(W_TYPE, 8, 4, 128) \
COMMON_GET_IF_M234(W_TYPE, 16, 4, 256) \
COMMON_GET_IF_M234(W_TYPE, 8, 4, 128)
#define BIGGROUP_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, -1, NUM_THREADS, false) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 8, NUM_THREADS, false) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false)
#define BIGGROUP_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
_GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \
_GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false) \
_GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \
_GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false) \
_GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \
_GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false)
#define BIGGROUP_GET_IF(W_TYPE) \
BIGGROUP_GET_IF_M1(W_TYPE, 8, 8, 256) \
BIGGROUP_GET_IF_M1(W_TYPE, 8, 4, 128) \
BIGGROUP_GET_IF_M234(W_TYPE, 16, 4, 256) \
BIGGROUP_GET_IF_M234(W_TYPE, 8, 4, 128)
#define NVFP4_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 1, NUM_THREADS, false) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false)
#define NVFP4_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
_GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false) \
_GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false) \
_GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false)
#define NVFP4_GET_IF(W_TYPE) \
NVFP4_GET_IF_M1(W_TYPE, 8, 8, 256) \
NVFP4_GET_IF_M1(W_TYPE, 8, 4, 128) \
NVFP4_GET_IF_M234(W_TYPE, 16, 4, 256) \
NVFP4_GET_IF_M234(W_TYPE, 8, 4, 128)
#define MXFP4_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 2, NUM_THREADS, false) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false)
#define MXFP4_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
_GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false) \
_GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false) \
_GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false)
#define MXFP4_GET_IF(W_TYPE) \
MXFP4_GET_IF_M1(W_TYPE, 8, 8, 256) \
MXFP4_GET_IF_M1(W_TYPE, 8, 4, 128) \
MXFP4_GET_IF_M234(W_TYPE, 16, 4, 256) \
MXFP4_GET_IF_M234(W_TYPE, 8, 4, 128)
// We currently have 4-bit models only with group_blocks == 4
#define FZP_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 4, NUM_THREADS, true) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS, true)
#define FZP_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
_GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS, true) \
_GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS, true) \
_GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS, true)
#define FZP_GET_IF(W_TYPE) \
FZP_GET_IF_M1(W_TYPE, 8, 8, 256) \
FZP_GET_IF_M1(W_TYPE, 8, 4, 128) \
FZP_GET_IF_M234(W_TYPE, 16, 4, 256) \
FZP_GET_IF_M234(W_TYPE, 8, 4, 128)
// We currently have 4-bit models only with group_blocks == 4
#define ACT_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS, false) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 0, NUM_THREADS, false)
#define ACT_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
_GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 0, NUM_THREADS, false) \
_GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 0, NUM_THREADS, false) \
_GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 0, NUM_THREADS, false)
#define ACT_GET_IF(W_TYPE) \
ACT_GET_IF_M1(W_TYPE, 8, 8, 256) \
ACT_GET_IF_M1(W_TYPE, 8, 4, 128) \
ACT_GET_IF_M234(W_TYPE, 16, 4, 256) \
ACT_GET_IF_M234(W_TYPE, 8, 4, 128)
template <typename scalar_t>
MarlinFuncPtr get_marlin_kernel(const vllm::ScalarType q_type,
int thread_m_blocks, int thread_n_blocks,
int thread_k_blocks, bool m_block_size_8,
bool has_act_order, bool has_zp,
int group_blocks, int num_threads,
bool is_zp_float) {
int num_bits = q_type.size_bits();
auto kernel = MarlinDefault; auto kernel = MarlinDefault;
if (false) {
}
COMMON_GET_IF(vllm::kU4) #include "kernel_selector.h"
COMMON_GET_IF(vllm::kU4B8)
COMMON_GET_IF(vllm::kU8B128)
NVFP4_GET_IF(vllm::kFE2M1f)
BIGGROUP_GET_IF(vllm::kFE4M3fn)
ACT_GET_IF(vllm::kU4B8)
ACT_GET_IF(vllm::kU8B128)
if (std::is_same<scalar_t, nv_bfloat16>::value) {
if (false) {
}
MXFP4_GET_IF(vllm::kFE2M1f)
}
return kernel; return kernel;
} }
template <typename scalar_t> exec_config_t determine_exec_config(
exec_config_t determine_exec_config(const vllm::ScalarType& q_type, int prob_m, const vllm::ScalarType& a_type, const vllm::ScalarType& b_type,
int prob_n, int prob_k, int thread_m_blocks, const vllm::ScalarType& c_type, const vllm::ScalarType& s_type, int prob_m,
bool m_block_size_8, int num_bits, int prob_n, int prob_k, int num_experts, int top_k, int thread_m_blocks,
int group_size, bool has_act_order, bool m_block_size_8, int num_bits, int group_size, bool has_act_order,
bool is_k_full, bool has_zp, bool is_k_full, bool has_zp, bool is_zp_float, int max_shared_mem, int sms,
bool is_zp_float, int max_shared_mem) { bool is_a_8bit) {
exec_config_t exec_cfg = exec_config_t{1, thread_config_t{-1, -1, -1}}; exec_config_t exec_cfg = exec_config_t{1, thread_config_t{-1, -1, -1}};
thread_config_t* thread_configs = thread_m_blocks > 1 thread_config_t* thread_configs = thread_m_blocks > 1
? large_batch_thread_configs ? large_batch_thread_configs
@ -471,73 +284,69 @@ exec_config_t determine_exec_config(const vllm::ScalarType& q_type, int prob_m,
if (!is_valid_config(th_config, m_block_size_8, thread_m_blocks, prob_m, if (!is_valid_config(th_config, m_block_size_8, thread_m_blocks, prob_m,
prob_n, prob_k, num_bits, group_size, has_act_order, prob_n, prob_k, num_bits, group_size, has_act_order,
is_k_full, has_zp, is_zp_float, max_shared_mem)) { is_k_full, has_zp, is_zp_float, max_shared_mem - 512,
is_a_8bit)) {
continue; continue;
} }
int cache_size = get_kernel_cache_size( int cache_size = get_kernel_cache_size(
th_config, m_block_size_8, thread_m_blocks, prob_m, prob_n, prob_k, th_config, m_block_size_8, thread_m_blocks, prob_m, prob_n, prob_k,
num_bits, group_size, has_act_order, is_k_full, has_zp, is_zp_float); num_bits, group_size, has_act_order, is_k_full, has_zp, is_zp_float,
is_a_8bit);
int group_blocks = 0; int group_blocks = 0;
if (!has_act_order) { if (!has_act_order) {
group_blocks = group_size == -1 ? -1 : (group_size / 16); group_blocks = group_size == -1 ? -1 : (group_size / 16);
} }
auto kernel = get_marlin_kernel<scalar_t>( auto kernel =
q_type, thread_m_blocks, th_config.thread_n / 16, get_marlin_kernel(a_type, b_type, c_type, s_type, thread_m_blocks,
th_config.thread_k / 16, m_block_size_8, has_act_order, has_zp, th_config.thread_n / 16, th_config.thread_k / 16,
group_blocks, th_config.num_threads, is_zp_float); m_block_size_8, has_act_order, has_zp, group_blocks,
th_config.num_threads, is_zp_float);
if (kernel == MarlinDefault) continue; if (kernel == MarlinDefault) continue;
if (thread_m_blocks > 1) { cudaFuncAttributes attr;
exec_cfg = {1, th_config}; cudaFuncGetAttributes(&attr, kernel);
break; int reg_size = max(attr.numRegs, 1) * th_config.num_threads * 4;
} else { int allow_count = min(device_max_reg_size / reg_size,
cudaFuncAttributes attr; max_shared_mem / (cache_size + 1536));
cudaFuncGetAttributes(&attr, kernel); if (thread_m_blocks == 1)
int reg_size = max(attr.numRegs, 1) * th_config.num_threads * 4;
int allow_count = min(device_max_reg_size / reg_size,
max_shared_mem / (cache_size + 1024));
allow_count = max(min(allow_count, 4), 1); allow_count = max(min(allow_count, 4), 1);
if (allow_count > count) { else
count = allow_count; allow_count = max(min(allow_count, 2), 1);
exec_cfg = {count, th_config};
}; if (prob_n / th_config.thread_n * prob_m * top_k * 4 < sms * allow_count) {
allow_count =
max(prob_n / th_config.thread_n * prob_m * top_k * 4 / sms, 1);
} }
if (allow_count > count) {
count = allow_count;
exec_cfg = {count, th_config};
};
} }
return exec_cfg; return exec_cfg;
} }
template <typename scalar_t>
void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias, void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
void* s, void* s2, void* zp, void* g_idx, void* perm, void* a_s, void* b_s, void* g_s, void* zp, void* g_idx,
void* a_tmp, void* sorted_token_ids, void* expert_ids, void* perm, void* a_tmp, void* sorted_token_ids,
void* num_tokens_past_padded, void* topk_weights, void* expert_ids, void* num_tokens_past_padded,
int moe_block_size, int top_k, bool mul_topk_weights, bool is_ep, void* topk_weights, int moe_block_size, int num_experts,
int prob_m, int prob_n, int prob_k, void* workspace, int top_k, bool mul_topk_weights, bool is_ep, int prob_m,
vllm::ScalarType const& q_type, bool has_bias, int prob_n, int prob_k, void* workspace,
bool has_act_order, bool is_k_full, bool has_zp, int num_groups, vllm::ScalarType const& a_type, vllm::ScalarType const& b_type,
int group_size, int dev, cudaStream_t stream, int thread_k, vllm::ScalarType const& c_type, vllm::ScalarType const& s_type,
int thread_n, int sms, bool use_atomic_add, bool use_fp32_reduce, bool has_bias, bool has_act_order, bool is_k_full, bool has_zp,
bool is_zp_float) { int num_groups, int group_size, int dev, cudaStream_t stream,
int thread_k, int thread_n, int sms, int blocks_per_sm,
bool use_atomic_add, bool use_fp32_reduce, bool is_zp_float) {
int thread_m_blocks = div_ceil(moe_block_size, 16); int thread_m_blocks = div_ceil(moe_block_size, 16);
bool m_block_size_8 = moe_block_size == 8; bool m_block_size_8 = moe_block_size == 8;
bool is_a_8bit = a_type.size_bits() == 8;
if (has_zp) {
TORCH_CHECK(
q_type == vllm::kU4 || q_type == vllm::kU8,
"q_type must be u4 or u8 when has_zp = True. Got = ", q_type.str());
} else {
TORCH_CHECK(
q_type == vllm::kU4B8 || q_type == vllm::kU8B128 ||
q_type == vllm::kFE4M3fn || q_type == vllm::kFE2M1f,
"q_type must be uint4b8, uint8b128, float8_e4m3fn or float4_e2m1f when "
"has_zp = False. Got = ",
q_type.str());
}
TORCH_CHECK(prob_m > 0 && prob_n > 0 && prob_k > 0, "Invalid MNK = [", prob_m, TORCH_CHECK(prob_m > 0 && prob_n > 0 && prob_k > 0, "Invalid MNK = [", prob_m,
", ", prob_n, ", ", prob_k, "]"); ", ", prob_n, ", ", prob_k, "]");
@ -563,14 +372,15 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
} }
} }
int num_bits = q_type.size_bits(); int num_bits = b_type.size_bits();
const int4* A_ptr = (const int4*)A; const int4* A_ptr = (const int4*)A;
const int4* B_ptr = (const int4*)B; const int4* B_ptr = (const int4*)B;
int4* C_ptr = (int4*)C; int4* C_ptr = (int4*)C;
int4* C_tmp_ptr = (int4*)C_tmp; int4* C_tmp_ptr = (int4*)C_tmp;
const int4* bias_ptr = (const int4*)b_bias; const int4* bias_ptr = (const int4*)b_bias;
const int4* s_ptr = (const int4*)s; const float* a_s_ptr = (const float*)a_s;
const uint16_t* s2_ptr = (const uint16_t*)s2; const int4* b_s_ptr = (const int4*)b_s;
const uint16_t* g_s_ptr = (const uint16_t*)g_s;
const int4* zp_ptr = (const int4*)zp; const int4* zp_ptr = (const int4*)zp;
const int* g_idx_ptr = (const int*)g_idx; const int* g_idx_ptr = (const int*)g_idx;
const int* perm_ptr = (const int*)perm; const int* perm_ptr = (const int*)perm;
@ -618,22 +428,41 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
cudaDevAttrMaxSharedMemoryPerBlockOptin, dev); cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
TORCH_CHECK(max_shared_mem > 0); TORCH_CHECK(max_shared_mem > 0);
int major_capability, minor_capability;
cudaDeviceGetAttribute(&major_capability, cudaDevAttrComputeCapabilityMajor,
dev);
cudaDeviceGetAttribute(&minor_capability, cudaDevAttrComputeCapabilityMinor,
dev);
TORCH_CHECK(major_capability * 10 + minor_capability >= 80,
"marlin kernel only support Ampere or newer GPUs.");
if (a_type == vllm::kFE4M3fn) {
TORCH_CHECK(major_capability * 10 + minor_capability >= 89,
"FP8 only support Ada Lovelace or newer GPUs.");
TORCH_CHECK(
major_capability * 10 + minor_capability == 89 ||
major_capability * 10 + minor_capability == 120,
"Marlin W4A8-FP8 only support SM89 or SM120 device (It is slower than "
"Marlin W4A16 on other devices).");
}
// Set thread config // Set thread config
exec_config_t exec_cfg; exec_config_t exec_cfg;
thread_config_t thread_tfg; thread_config_t thread_tfg;
if (thread_k != -1 && thread_n != -1) { if (thread_k != -1 && thread_n != -1) {
thread_tfg = thread_config_t{thread_k, thread_n, default_threads}; thread_tfg = thread_config_t{thread_k, thread_n, thread_k * thread_n / 64};
exec_cfg = exec_config_t{1, thread_tfg}; if (blocks_per_sm == -1) blocks_per_sm = 1;
exec_cfg = exec_config_t{blocks_per_sm, thread_tfg};
TORCH_CHECK(prob_n % thread_n == 0, "prob_n = ", prob_n, TORCH_CHECK(prob_n % thread_n == 0, "prob_n = ", prob_n,
" is not divisible by thread_n = ", thread_n); " is not divisible by thread_n = ", thread_n);
TORCH_CHECK(prob_k % thread_k == 0, "prob_k = ", prob_k, TORCH_CHECK(prob_k % thread_k == 0, "prob_k = ", prob_k,
" is not divisible by thread_k = ", thread_k); " is not divisible by thread_k = ", thread_k);
} else { } else {
// Auto config // Auto config
exec_cfg = determine_exec_config<scalar_t>( exec_cfg = determine_exec_config(
q_type, prob_m, prob_n, prob_k, thread_m_blocks, m_block_size_8, a_type, b_type, c_type, s_type, prob_m, prob_n, prob_k, num_experts,
num_bits, group_size, has_act_order, is_k_full, has_zp, is_zp_float, top_k, thread_m_blocks, m_block_size_8, num_bits, group_size,
max_shared_mem); has_act_order, is_k_full, has_zp, is_zp_float, max_shared_mem, sms,
is_a_8bit);
thread_tfg = exec_cfg.tb_cfg; thread_tfg = exec_cfg.tb_cfg;
} }
@ -647,22 +476,29 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
int thread_k_blocks = thread_k / 16; int thread_k_blocks = thread_k / 16;
int thread_n_blocks = thread_n / 16; int thread_n_blocks = thread_n / 16;
TORCH_CHECK( TORCH_CHECK(is_valid_config(thread_tfg, m_block_size_8, thread_m_blocks,
is_valid_config(thread_tfg, m_block_size_8, thread_m_blocks, prob_m, prob_m, prob_n, prob_k, num_bits, group_size,
prob_n, prob_k, num_bits, group_size, has_act_order, has_act_order, is_k_full, has_zp, is_zp_float,
is_k_full, has_zp, is_zp_float, max_shared_mem), max_shared_mem, is_a_8bit),
"Invalid thread config: thread_m_blocks = ", thread_m_blocks, "Invalid thread config: thread_m_blocks = ", thread_m_blocks,
", thread_k = ", thread_tfg.thread_k, ", thread_k = ", thread_tfg.thread_k,
", thread_n = ", thread_tfg.thread_n, ", thread_n = ", thread_tfg.thread_n,
", num_threads = ", thread_tfg.num_threads, " for MKN = [", prob_m, ", ", ", num_threads = ", thread_tfg.num_threads, " for MKN = [",
prob_k, ", ", prob_n, "] and num_bits = ", num_bits, prob_m, ", ", prob_k, ", ", prob_n, "] and num_bits = ", num_bits,
", group_size = ", group_size, ", has_act_order = ", has_act_order, ", group_size = ", group_size,
", is_k_full = ", is_k_full, ", has_zp = ", has_zp, ", has_act_order = ", has_act_order, ", is_k_full = ", is_k_full,
", is_zp_float = ", is_zp_float, ", max_shared_mem = ", max_shared_mem); ", has_zp = ", has_zp, ", is_zp_float = ", is_zp_float,
", max_shared_mem = ", max_shared_mem);
auto kernel = get_marlin_kernel<scalar_t>( int sh_cache_size =
q_type, thread_m_blocks, thread_n_blocks, thread_k_blocks, m_block_size_8, get_kernel_cache_size(thread_tfg, m_block_size_8, thread_m_blocks, prob_m,
has_act_order, has_zp, group_blocks, num_threads, is_zp_float); prob_n, prob_k, num_bits, group_size, has_act_order,
is_k_full, has_zp, is_zp_float, is_a_8bit);
auto kernel = get_marlin_kernel(
a_type, b_type, c_type, s_type, thread_m_blocks, thread_n_blocks,
thread_k_blocks, m_block_size_8, has_act_order, has_zp, group_blocks,
num_threads, is_zp_float);
if (kernel == MarlinDefault) { if (kernel == MarlinDefault) {
TORCH_CHECK(false, "Unsupported shapes: MNK = [", prob_m, ", ", prob_n, TORCH_CHECK(false, "Unsupported shapes: MNK = [", prob_m, ", ", prob_n,
@ -679,19 +515,20 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
// avoid ">>>" being formatted to "> > >" // avoid ">>>" being formatted to "> > >"
// clang-format off // clang-format off
kernel<<<blocks, num_threads, max_shared_mem, stream>>>( kernel<<<blocks, num_threads, max_shared_mem, stream>>>(
A_ptr, B_ptr, C_ptr, C_tmp_ptr, bias_ptr, s_ptr, s2_ptr, zp_ptr, g_idx_ptr, A_ptr, B_ptr, C_ptr, C_tmp_ptr, bias_ptr, a_s_ptr, b_s_ptr, g_s_ptr, zp_ptr, g_idx_ptr,
sorted_token_ids_ptr, expert_ids_ptr, num_tokens_past_padded_ptr, sorted_token_ids_ptr, expert_ids_ptr, num_tokens_past_padded_ptr,
topk_weights_ptr, top_k, mul_topk_weights, is_ep, num_groups, prob_m, topk_weights_ptr, top_k, mul_topk_weights, is_ep, num_groups, prob_m,
prob_n, prob_k, locks, has_bias, use_atomic_add, use_fp32_reduce, max_shared_mem); prob_n, prob_k, locks, has_bias, use_atomic_add, use_fp32_reduce);
// clang-format on // clang-format on
} }
} // namespace MARLIN_NAMESPACE_NAME } // namespace MARLIN_NAMESPACE_NAME
torch::Tensor moe_wna16_marlin_gemm( torch::Tensor moe_wna16_marlin_gemm(
torch::Tensor& a, std::optional<torch::Tensor> const& c_or_none, torch::Tensor& a, std::optional<torch::Tensor> c_or_none,
torch::Tensor& b_q_weight, torch::Tensor& b_q_weight,
std::optional<torch::Tensor> const& b_bias_or_none, torch::Tensor& b_scales, std::optional<torch::Tensor> const& b_bias_or_none, torch::Tensor& b_scales,
std::optional<torch::Tensor> const& a_scales_or_none,
std::optional<torch::Tensor> const& global_scale_or_none, std::optional<torch::Tensor> const& global_scale_or_none,
std::optional<torch::Tensor> const& b_zeros_or_none, std::optional<torch::Tensor> const& b_zeros_or_none,
std::optional<torch::Tensor> const& g_idx_or_none, std::optional<torch::Tensor> const& g_idx_or_none,
@ -699,11 +536,70 @@ torch::Tensor moe_wna16_marlin_gemm(
torch::Tensor& sorted_token_ids, torch::Tensor& expert_ids, torch::Tensor& sorted_token_ids, torch::Tensor& expert_ids,
torch::Tensor& num_tokens_past_padded, torch::Tensor& topk_weights, torch::Tensor& num_tokens_past_padded, torch::Tensor& topk_weights,
int64_t moe_block_size, int64_t top_k, bool mul_topk_weights, bool is_ep, int64_t moe_block_size, int64_t top_k, bool mul_topk_weights, bool is_ep,
vllm::ScalarTypeId const& b_q_type_id, int64_t size_m, int64_t size_n, vllm::ScalarTypeId const& b_type_id, int64_t size_m, int64_t size_n,
int64_t size_k, bool is_k_full, bool use_atomic_add, bool use_fp32_reduce, int64_t size_k, bool is_k_full, bool use_atomic_add, bool use_fp32_reduce,
bool is_zp_float) { bool is_zp_float, int64_t thread_k, int64_t thread_n,
vllm::ScalarType const b_q_type = vllm::ScalarType::from_id(b_q_type_id); int64_t blocks_per_sm) {
int pack_factor = 32 / b_q_type.size_bits(); vllm::ScalarTypeId a_type_id, c_type_id, s_type_id;
auto c_dtype = a.dtype();
if (a.scalar_type() == at::ScalarType::Half) {
a_type_id = vllm::kFloat16.id();
c_type_id = vllm::kFloat16.id();
} else if (a.scalar_type() == at::ScalarType::BFloat16) {
a_type_id = vllm::kBFloat16.id();
c_type_id = vllm::kBFloat16.id();
} else {
c_dtype = b_scales.dtype();
if (b_scales.scalar_type() == at::ScalarType::Half) {
c_type_id = vllm::kFloat16.id();
} else if (b_scales.scalar_type() == at::ScalarType::BFloat16) {
c_type_id = vllm::kBFloat16.id();
} else {
c_type_id = vllm::kBFloat16.id();
TORCH_CHECK(c_or_none.has_value(), "c must be passed for W4A8-FP4");
torch::Tensor c = c_or_none.value();
c_dtype = c.dtype();
if (c.scalar_type() == at::ScalarType::Half) {
c_type_id = vllm::kFloat16.id();
} else if (c.scalar_type() == at::ScalarType::BFloat16) {
c_type_id = vllm::kBFloat16.id();
} else {
TORCH_CHECK(false, "unsupported c dtype");
}
}
if (a.scalar_type() == at::ScalarType::Float8_e4m3fn) {
a_type_id = vllm::kFE4M3fn.id();
} else if (a.scalar_type() == at::ScalarType::Char) {
a_type_id = vllm::kS8.id();
} else {
TORCH_CHECK(false, "unsupported `a` scalar_type");
}
}
s_type_id = c_type_id;
if (b_type_id == vllm::kFE2M1f.id()) {
if (b_scales.scalar_type() == at::ScalarType::Float8_e4m3fn) {
s_type_id = vllm::kFE4M3fn.id();
} else if (b_scales.scalar_type() == at::ScalarType::Float8_e8m0fnu) {
s_type_id = vllm::kFE8M0fnu.id();
} else {
TORCH_CHECK(false,
"When b_type = float4_e2m1f, b_scale scalar type must be",
"float8_e4m3fn (for NVFP4) or float8_e8m0fnu (for MXFP4).");
}
}
vllm::ScalarType a_type = vllm::ScalarType::from_id(a_type_id);
vllm::ScalarType b_type = vllm::ScalarType::from_id(b_type_id);
vllm::ScalarType c_type = vllm::ScalarType::from_id(c_type_id);
vllm::ScalarType s_type = vllm::ScalarType::from_id(s_type_id);
int pack_factor = 32 / b_type.size_bits();
int num_experts = b_q_weight.size(0);
if (moe_block_size != 8) { if (moe_block_size != 8) {
TORCH_CHECK(moe_block_size % 16 == 0, TORCH_CHECK(moe_block_size % 16 == 0,
@ -745,19 +641,27 @@ torch::Tensor moe_wna16_marlin_gemm(
TORCH_CHECK(b_scales.device().is_cuda(), "b_scales is not on GPU"); TORCH_CHECK(b_scales.device().is_cuda(), "b_scales is not on GPU");
TORCH_CHECK(b_scales.is_contiguous(), "b_scales is not contiguous"); TORCH_CHECK(b_scales.is_contiguous(), "b_scales is not contiguous");
// thread_k: `k` size of a thread_tile in `weights` (can usually be left as torch::Tensor a_scales;
// auto -1) auto options = torch::TensorOptions().dtype(c_dtype).device(a.device());
int thread_k = -1; auto options_fp32 =
// thread_n: `n` size of a thread_tile in `weights` (can usually be left as torch::TensorOptions().dtype(at::kFloat).device(a.device());
// auto -1)
int thread_n = -1; if (a_scales_or_none.has_value()) {
a_scales = a_scales_or_none.value();
TORCH_CHECK(a_type.size_bits() == 8,
"a_scales can only be used for 8bit activation.");
} else {
a_scales = torch::empty({0}, options_fp32);
TORCH_CHECK(a_type.size_bits() != 8,
"the a_scales parameter must be passed for 8bit activation.");
}
// sms: number of SMs to use for the kernel // sms: number of SMs to use for the kernel
int sms = -1; int sms = -1;
cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, a.get_device()); cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, a.get_device());
// Alloc buffers // Alloc buffers
const at::cuda::OptionalCUDAGuard device_guard(device_of(a)); const at::cuda::OptionalCUDAGuard device_guard(device_of(a));
auto options = torch::TensorOptions().dtype(a.dtype()).device(a.device());
torch::Tensor c; torch::Tensor c;
if (c_or_none.has_value()) { if (c_or_none.has_value()) {
c = c_or_none.value(); c = c_or_none.value();
@ -774,8 +678,6 @@ torch::Tensor moe_wna16_marlin_gemm(
// Alloc C tmp buffer that is going to be used for the global reduce // Alloc C tmp buffer that is going to be used for the global reduce
torch::Tensor c_tmp; torch::Tensor c_tmp;
auto options_fp32 =
torch::TensorOptions().dtype(at::kFloat).device(a.device());
if (use_fp32_reduce && !use_atomic_add) { if (use_fp32_reduce && !use_atomic_add) {
// max num of threadblocks is sms * 4 // max num of threadblocks is sms * 4
long max_c_tmp_size = min( long max_c_tmp_size = min(
@ -846,11 +748,11 @@ torch::Tensor moe_wna16_marlin_gemm(
torch::Tensor global_scale; torch::Tensor global_scale;
if (global_scale_or_none.has_value()) { if (global_scale_or_none.has_value()) {
global_scale = global_scale_or_none.value(); global_scale = global_scale_or_none.value();
TORCH_CHECK(b_q_type == vllm::kFE2M1f && group_size == 16, TORCH_CHECK(b_type == vllm::kFE2M1f && s_type == vllm::kFE4M3fn,
"global_scale can only be used for nvfp4 format."); "global_scale can only be used for nvfp4 format.");
} else { } else {
global_scale = torch::empty({0}, options); global_scale = torch::empty({0}, options);
TORCH_CHECK(!(b_q_type == vllm::kFE2M1f && group_size == 16), TORCH_CHECK(!(b_type == vllm::kFE2M1f && s_type == vllm::kFE4M3fn),
"the global_scale parameter must be passed for nvfp4 format."); "the global_scale parameter must be passed for nvfp4 format.");
} }
@ -877,15 +779,15 @@ torch::Tensor moe_wna16_marlin_gemm(
bool has_zp = b_zeros.size(-1) > 0; bool has_zp = b_zeros.size(-1) > 0;
if (has_zp) { if (has_zp) {
TORCH_CHECK( TORCH_CHECK(
b_q_type == vllm::kU4 || b_q_type == vllm::kU8, b_type == vllm::kU4 || b_type == vllm::kU8,
"b_q_type must be u4 or u8 when has_zp = True. Got = ", b_q_type.str()); "b_type must be u4 or u8 when has_zp = True. Got = ", b_type.str());
} else { } else {
TORCH_CHECK(b_q_type == vllm::kU4B8 || b_q_type == vllm::kU8B128 || TORCH_CHECK(b_type == vllm::kU4B8 || b_type == vllm::kU8B128 ||
b_q_type == vllm::kFE4M3fn || b_q_type == vllm::kFE2M1f, b_type == vllm::kS4 || b_type == vllm::kS8 ||
"b_q_type must be uint4b8, uint8b128, float8_e4m3fn or " b_type == vllm::kFE4M3fn || b_type == vllm::kFE2M1f,
"float4_e2m1f when " "b_type must be uint4b8, uint8b128, int4, int8, "
"has_zp = False. Got = ", "float8_e4m3fn or float4_e2m1f when has_zp = False. Got = ",
b_q_type.str()); b_type.str());
} }
if (has_zp && is_zp_float) { if (has_zp && is_zp_float) {
@ -929,71 +831,33 @@ torch::Tensor moe_wna16_marlin_gemm(
" is below min_workspace_size = ", min_workspace_size); " is below min_workspace_size = ", min_workspace_size);
int dev = a.get_device(); int dev = a.get_device();
if (a.scalar_type() == at::ScalarType::Half) {
void* scales_ptr;
if (b_q_type == vllm::kFE2M1f) {
if (group_size == 16)
scales_ptr = b_scales.data_ptr<at::Float8_e4m3fn>();
else if (group_size == 32)
scales_ptr = b_scales.data_ptr<at::Float8_e8m0fnu>();
else
TORCH_CHECK(false,
"float4_e2m1f only supports group_size == 16 (NVFP4) ",
"and group_size == 32 (MXFP4)");
} else {
scales_ptr = b_scales.data_ptr<at::Half>();
}
MARLIN_NAMESPACE_NAME::marlin_mm<half>( TORCH_CHECK(a_scales.scalar_type() == at::ScalarType::Float,
a.data_ptr<at::Half>(), b_q_weight.data_ptr(), c.data_ptr<at::Half>(), "scalar type of a_scales must be float");
c_tmp.data_ptr<float>(), b_bias.data_ptr<at::Half>(), scales_ptr, TORCH_CHECK(global_scale.scalar_type() == c.scalar_type(),
global_scale.data_ptr<at::Half>(), b_zeros.data_ptr(), g_idx.data_ptr(), "scalar type of global_scale must be the same with c");
perm.data_ptr(), a_tmp.data_ptr<at::Half>(), if (a_type.size_bits() == 16) {
sorted_token_ids.data_ptr(), expert_ids.data_ptr(), TORCH_CHECK(
num_tokens_past_padded.data_ptr(), topk_weights.data_ptr(), a.scalar_type() == c.scalar_type(),
moe_block_size, top_k, mul_topk_weights, is_ep, size_m, size_n, size_k, "scalar type of a must be the same with c for 16 bit activation");
workspace.data_ptr(), b_q_type, has_bias, has_act_order, is_k_full,
has_zp, num_groups, group_size, dev,
at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms,
use_atomic_add, use_fp32_reduce, is_zp_float);
} else if (a.scalar_type() == at::ScalarType::BFloat16) {
void* scales_ptr;
if (b_q_type == vllm::kFE2M1f) {
if (group_size == 16)
scales_ptr = b_scales.data_ptr<at::Float8_e4m3fn>();
else if (group_size == 32)
scales_ptr = b_scales.data_ptr<at::Float8_e8m0fnu>();
else
TORCH_CHECK(false,
"float4_e2m1f only supports group_size == 16 (NVFP4) ",
"and group_size == 32 (MXFP4)");
} else {
scales_ptr = b_scales.data_ptr<at::BFloat16>();
}
MARLIN_NAMESPACE_NAME::marlin_mm<nv_bfloat16>(
a.data_ptr<at::BFloat16>(), b_q_weight.data_ptr(),
c.data_ptr<at::BFloat16>(), c_tmp.data_ptr<float>(),
b_bias.data_ptr<at::BFloat16>(), scales_ptr,
global_scale.data_ptr<at::BFloat16>(), b_zeros.data_ptr(),
g_idx.data_ptr(), perm.data_ptr(), a_tmp.data_ptr<at::BFloat16>(),
sorted_token_ids.data_ptr(), expert_ids.data_ptr(),
num_tokens_past_padded.data_ptr(), topk_weights.data_ptr(),
moe_block_size, top_k, mul_topk_weights, is_ep, size_m, size_n, size_k,
workspace.data_ptr(), b_q_type, has_bias, has_act_order, is_k_full,
has_zp, num_groups, group_size, dev,
at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms,
use_atomic_add, use_fp32_reduce, is_zp_float);
} else {
TORCH_CHECK(false,
"moe_wna16_marlin_gemm only supports bfloat16 and float16");
} }
MARLIN_NAMESPACE_NAME::marlin_mm(
a.data_ptr(), b_q_weight.data_ptr(), c.data_ptr(), c_tmp.data_ptr(),
b_bias.data_ptr(), a_scales.data_ptr(), b_scales.data_ptr(),
global_scale.data_ptr(), b_zeros.data_ptr(), g_idx.data_ptr(),
perm.data_ptr(), a_tmp.data_ptr(), sorted_token_ids.data_ptr(),
expert_ids.data_ptr(), num_tokens_past_padded.data_ptr(),
topk_weights.data_ptr(), moe_block_size, num_experts, top_k,
mul_topk_weights, is_ep, size_m, size_n, size_k, workspace.data_ptr(),
a_type, b_type, c_type, s_type, has_bias, has_act_order, is_k_full,
has_zp, num_groups, group_size, dev, at::cuda::getCurrentCUDAStream(dev),
thread_k, thread_n, sms, blocks_per_sm, use_atomic_add, use_fp32_reduce,
is_zp_float);
return c; return c;
} }
#endif
TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) { TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
m.impl("moe_wna16_marlin_gemm", &moe_wna16_marlin_gemm); m.impl("moe_wna16_marlin_gemm", &moe_wna16_marlin_gemm);
} }

View File

@ -63,16 +63,18 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
m.def( m.def(
"moe_wna16_marlin_gemm(Tensor! a, Tensor? c_or_none," "moe_wna16_marlin_gemm(Tensor! a, Tensor? c_or_none,"
"Tensor! b_q_weight, Tensor? b_bias_or_none," "Tensor! b_q_weight, Tensor? b_bias_or_none,"
"Tensor! b_scales, Tensor? global_scale, Tensor? " "Tensor! b_scales, Tensor? a_scales, Tensor? global_scale, Tensor? "
"b_zeros_or_none," "b_zeros_or_none,"
"Tensor? g_idx_or_none, Tensor? perm_or_none, Tensor! workspace," "Tensor? g_idx_or_none, Tensor? perm_or_none, Tensor! workspace,"
"Tensor sorted_token_ids," "Tensor sorted_token_ids,"
"Tensor! expert_ids, Tensor! num_tokens_past_padded," "Tensor! expert_ids, Tensor! num_tokens_past_padded,"
"Tensor! topk_weights, int moe_block_size, int top_k, " "Tensor! topk_weights, int moe_block_size, int top_k, "
"bool mul_topk_weights, bool is_ep, int b_q_type_id," "bool mul_topk_weights, bool is_ep, int b_type_id,"
"int size_m, int size_n, int size_k," "int size_m, int size_n, int size_k,"
"bool is_full_k, bool use_atomic_add," "bool is_full_k, bool use_atomic_add,"
"bool use_fp32_reduce, bool is_zp_float) -> Tensor"); "bool use_fp32_reduce, bool is_zp_float,"
"int thread_k, int thread_n, int blocks_per_sm) -> Tensor");
m.def( m.def(
"marlin_gemm_moe(Tensor! a, Tensor! b_q_weights, Tensor! sorted_ids, " "marlin_gemm_moe(Tensor! a, Tensor! b_q_weights, Tensor! sorted_ids, "
"Tensor! topk_weights, Tensor! topk_ids, Tensor! b_scales, Tensor! " "Tensor! topk_weights, Tensor! topk_ids, Tensor! b_scales, Tensor! "

View File

@ -52,14 +52,13 @@ void paged_attention_v2(
const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
const int64_t blocksparse_head_sliding_step); const int64_t blocksparse_head_sliding_step);
#ifndef USE_ROCM
void merge_attn_states(torch::Tensor& output, void merge_attn_states(torch::Tensor& output,
std::optional<torch::Tensor> output_lse, std::optional<torch::Tensor> output_lse,
const torch::Tensor& prefix_output, const torch::Tensor& prefix_output,
const torch::Tensor& prefix_lse, const torch::Tensor& prefix_lse,
const torch::Tensor& suffix_output, const torch::Tensor& suffix_output,
const torch::Tensor& suffix_lse); const torch::Tensor& suffix_lse);
#ifndef USE_ROCM
void convert_vertical_slash_indexes( void convert_vertical_slash_indexes(
torch::Tensor& block_count, // [BATCH, N_HEADS, NUM_ROWS] torch::Tensor& block_count, // [BATCH, N_HEADS, NUM_ROWS]
torch::Tensor& block_offset, // [BATCH, N_HEADS, NUM_ROWS, NNZ_S] torch::Tensor& block_offset, // [BATCH, N_HEADS, NUM_ROWS, NNZ_S]

View File

@ -22,6 +22,7 @@
#include <ATen/cuda/CUDAContext.h> #include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h> #include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAStream.h> #include <c10/cuda/CUDAStream.h>
#include "cutlass_extensions/common.hpp"
#include "cute/tensor.hpp" #include "cute/tensor.hpp"
#include "cutlass/tensor_ref.h" #include "cutlass/tensor_ref.h"
@ -173,7 +174,7 @@ void run_get_group_gemm_starts(
} }
template <typename OutType> template <typename OutType>
void run_fp4_blockwise_scaled_group_mm( void run_fp4_blockwise_scaled_group_mm_sm100(
torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b, torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
const torch::Tensor& a_blockscale, const torch::Tensor& b_blockscales, const torch::Tensor& a_blockscale, const torch::Tensor& b_blockscales,
const torch::Tensor& alphas, const torch::Tensor& problem_sizes, const torch::Tensor& alphas, const torch::Tensor& problem_sizes,
@ -343,17 +344,225 @@ void run_fp4_blockwise_scaled_group_mm(
auto can_implement_status = gemm_op.can_implement(args); auto can_implement_status = gemm_op.can_implement(args);
TORCH_CHECK(can_implement_status == cutlass::Status::kSuccess, TORCH_CHECK(can_implement_status == cutlass::Status::kSuccess,
"Failed to implement GEMM"); "Failed to implement GEMM: status=", (int)can_implement_status);
// Run the GEMM // Run the GEMM
auto status = gemm_op.initialize(args, workspace.data_ptr()); auto status = gemm_op.initialize(args, workspace.data_ptr());
TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to initialize GEMM"); TORCH_CHECK(status == cutlass::Status::kSuccess,
"Failed to initialize GEMM: status=", (int)status,
" workspace_size=", workspace_size, " num_experts=", num_experts,
" M=", M, " N=", N, " K=", K);
status = gemm_op.run(args, workspace.data_ptr(), stream); status = gemm_op.run(args, workspace.data_ptr(), stream);
TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to run GEMM"); TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to run GEMM");
} }
void run_fp4_blockwise_scaled_group_mm_sm120(
torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
const torch::Tensor& a_blockscale, const torch::Tensor& b_blockscales,
const torch::Tensor& alphas, const torch::Tensor& problem_sizes,
const torch::Tensor& expert_offsets, const torch::Tensor& sf_offsets, int M,
int N, int K) {
using ProblemShape =
cutlass::gemm::GroupProblemShape<Shape<int32_t, int32_t, int32_t>>;
using ElementType = cutlass::float_e2m1_t;
using ElementSFType = cutlass::float_ue4m3_t;
using ElementA = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
using ElementB = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
// NOTE: For SM120 it seems templating the output type is not supported and
// we need to hardcode the output type to bfloat16
using ElementC = cutlass::bfloat16_t;
using ElementD = ElementC;
using ElementAccumulator = float;
// Layout definitions
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::RowMajor;
using LayoutD = LayoutC;
// Alignment constraints
static constexpr int AlignmentA = 32;
static constexpr int AlignmentB = 32;
static constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;
static constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;
// Architecture definitions
using ArchTag = cutlass::arch::Sm120;
using OperatorClass = cutlass::arch::OpClassBlockScaledTensorOp;
using ClusterShape = Shape<_1, _1, _1>;
using MmaTileShape = Shape<_128, _128, _128>;
using FusionOperation = cutlass::epilogue::fusion::LinearCombination<
ElementD, ElementAccumulator, ElementC, ElementAccumulator>;
using CollectiveEpilogue =
typename cutlass::epilogue::collective::CollectiveBuilder<
ArchTag, OperatorClass, MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto, ElementAccumulator,
ElementAccumulator, ElementC, LayoutC*, AlignmentC, ElementD,
LayoutD*, AlignmentD,
cutlass::epilogue::collective::EpilogueScheduleAuto,
FusionOperation>::CollectiveOp;
using CollectiveMainloop =
typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass, ElementA, LayoutA*, AlignmentA, ElementB,
LayoutB*, AlignmentB, ElementAccumulator, MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto>::CollectiveOp;
using GemmKernel =
cutlass::gemm::kernel::GemmUniversal<ProblemShape, CollectiveMainloop,
CollectiveEpilogue>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
using StrideA = typename Gemm::GemmKernel::InternalStrideA;
using StrideB = typename Gemm::GemmKernel::InternalStrideB;
using StrideC = typename Gemm::GemmKernel::InternalStrideC;
using StrideD = typename Gemm::GemmKernel::InternalStrideD;
using LayoutSFA =
typename Gemm::GemmKernel::CollectiveMainloop::InternalLayoutSFA;
using LayoutSFB =
typename Gemm::GemmKernel::CollectiveMainloop::InternalLayoutSFB;
using ScaleConfig =
typename Gemm::GemmKernel::CollectiveMainloop::Sm1xxBlkScaledConfig;
using UnderlyingProblemShape = ProblemShape::UnderlyingProblemShape;
int num_experts = static_cast<int>(expert_offsets.size(0));
auto options_int =
torch::TensorOptions().dtype(torch::kInt64).device(a.device());
torch::Tensor a_ptrs = torch::empty(num_experts, options_int);
torch::Tensor b_ptrs = torch::empty(num_experts, options_int);
torch::Tensor out_ptrs = torch::empty(num_experts, options_int);
torch::Tensor a_scales_ptrs = torch::empty(num_experts, options_int);
torch::Tensor b_scales_ptrs = torch::empty(num_experts, options_int);
torch::Tensor alpha_ptrs = torch::empty(num_experts, options_int);
torch::Tensor layout_sfa = torch::empty({num_experts, 5}, options_int);
torch::Tensor layout_sfb = torch::empty({num_experts, 5}, options_int);
torch::Tensor c_strides1 =
torch::full({num_experts}, output.stride(0), options_int);
torch::Tensor a_strides1 =
torch::full({num_experts}, a.stride(0) * 2, options_int);
torch::Tensor b_strides1 =
torch::full({num_experts}, b.stride(1) * 2, options_int);
run_get_group_gemm_starts<LayoutSFA, LayoutSFB, ScaleConfig>(
a_ptrs, b_ptrs, out_ptrs, a_scales_ptrs, b_scales_ptrs, alpha_ptrs,
layout_sfa, layout_sfb, a, b, output, a_blockscale, b_blockscales, alphas,
expert_offsets, sf_offsets, problem_sizes, M, N, K);
// Create an instance of the GEMM
Gemm gemm_op;
// Initialize problem_sizes_as_shapes correctly
UnderlyingProblemShape* problem_sizes_as_shapes =
static_cast<UnderlyingProblemShape*>(problem_sizes.data_ptr());
// Set the Scheduler info
cutlass::KernelHardwareInfo hw_info;
using RasterOrderOptions = cutlass::gemm::kernel::detail::RasterOrderOptions;
typename Gemm::GemmKernel::TileSchedulerArguments scheduler;
scheduler.raster_order = RasterOrderOptions::AlongM;
hw_info.device_id = a.get_device();
static std::unordered_map<int, int> cached_sm_counts;
if (cached_sm_counts.find(hw_info.device_id) == cached_sm_counts.end()) {
cached_sm_counts[hw_info.device_id] =
cutlass::KernelHardwareInfo::query_device_multiprocessor_count(
hw_info.device_id);
}
hw_info.sm_count = min(cached_sm_counts[hw_info.device_id], INT_MAX);
// Mainloop Arguments
typename GemmKernel::MainloopArguments mainloop_args{
static_cast<const ElementType**>(a_ptrs.data_ptr()),
static_cast<StrideA*>(a_strides1.data_ptr()),
static_cast<const ElementType**>(b_ptrs.data_ptr()),
static_cast<StrideB*>(b_strides1.data_ptr()),
static_cast<const ElementSFType**>(a_scales_ptrs.data_ptr()),
reinterpret_cast<LayoutSFA*>(layout_sfa.data_ptr()),
static_cast<const ElementSFType**>(b_scales_ptrs.data_ptr()),
reinterpret_cast<LayoutSFB*>(layout_sfb.data_ptr())};
// Epilogue Arguments
typename GemmKernel::EpilogueArguments epilogue_args{
{}, // epilogue.thread
nullptr,
static_cast<StrideC*>(c_strides1.data_ptr()),
static_cast<ElementD**>(out_ptrs.data_ptr()),
static_cast<StrideC*>(c_strides1.data_ptr())};
auto& fusion_args = epilogue_args.thread;
fusion_args.alpha_ptr_array =
reinterpret_cast<float**>(alpha_ptrs.data_ptr());
fusion_args.dAlpha = {_0{}, _0{}, 1};
fusion_args.beta = 0.0f;
// Gemm Arguments
typename GemmKernel::Arguments args{
cutlass::gemm::GemmUniversalMode::kGrouped,
{num_experts, problem_sizes_as_shapes, nullptr},
mainloop_args,
epilogue_args,
hw_info,
scheduler};
size_t workspace_size = Gemm::get_workspace_size(args);
auto const workspace_options =
torch::TensorOptions().dtype(torch::kUInt8).device(a.device());
auto workspace = torch::empty(workspace_size, workspace_options);
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(a.get_device());
auto can_implement_status = gemm_op.can_implement(args);
TORCH_CHECK(can_implement_status == cutlass::Status::kSuccess,
"Failed to implement GEMM: status=", (int)can_implement_status);
// Run the GEMM
auto status = gemm_op.initialize(args, workspace.data_ptr());
TORCH_CHECK(status == cutlass::Status::kSuccess,
"Failed to initialize GEMM: status=", (int)status,
" workspace_size=", workspace_size, " num_experts=", num_experts,
" M=", M, " N=", N, " K=", K);
status = gemm_op.run(args, workspace.data_ptr(), stream);
TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to run GEMM");
}
template <typename OutType>
void run_fp4_blockwise_scaled_group_mm(
torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
const torch::Tensor& a_blockscale, const torch::Tensor& b_blockscales,
const torch::Tensor& alphas, const torch::Tensor& problem_sizes,
const torch::Tensor& expert_offsets, const torch::Tensor& sf_offsets, int M,
int N, int K) {
int32_t version_num = get_sm_version_num();
#if defined ENABLE_NVFP4_SM120 && ENABLE_NVFP4_SM120
if (version_num >= 120 && version_num < 130) {
run_fp4_blockwise_scaled_group_mm_sm120(
output, a, b, a_blockscale, b_blockscales, alphas, problem_sizes,
expert_offsets, sf_offsets, M, N, K);
return;
}
#endif
#if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100 #if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100
if (version_num >= 100 && version_num < 120) {
run_fp4_blockwise_scaled_group_mm_sm100<OutType>(
output, a, b, a_blockscale, b_blockscales, alphas, problem_sizes,
expert_offsets, sf_offsets, M, N, K);
return;
}
#endif
TORCH_CHECK_NOT_IMPLEMENTED(
false,
"No compiled cutlass_fp4_group_mm kernel for CUDA device capability: ",
version_num, ". Required capability: 100 or 120");
}
#if (defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100) || \
(defined ENABLE_NVFP4_SM120 && ENABLE_NVFP4_SM120)
constexpr auto FLOAT4_E2M1X2 = at::ScalarType::Byte; constexpr auto FLOAT4_E2M1X2 = at::ScalarType::Byte;
constexpr auto SF_DTYPE = at::ScalarType::Float8_e4m3fn; constexpr auto SF_DTYPE = at::ScalarType::Float8_e4m3fn;
#endif #endif
@ -374,7 +583,8 @@ void cutlass_fp4_group_mm(
const torch::Tensor& a_blockscale, const torch::Tensor& b_blockscales, const torch::Tensor& a_blockscale, const torch::Tensor& b_blockscales,
const torch::Tensor& alphas, const torch::Tensor& problem_sizes, const torch::Tensor& alphas, const torch::Tensor& problem_sizes,
const torch::Tensor& expert_offsets, const torch::Tensor& sf_offsets) { const torch::Tensor& expert_offsets, const torch::Tensor& sf_offsets) {
#if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100 #if (defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100) || \
(defined ENABLE_NVFP4_SM120 && ENABLE_NVFP4_SM120)
// Input validation // Input validation
CHECK_INPUT(a, FLOAT4_E2M1X2, "a"); CHECK_INPUT(a, FLOAT4_E2M1X2, "a");
CHECK_INPUT(b, FLOAT4_E2M1X2, "b"); CHECK_INPUT(b, FLOAT4_E2M1X2, "b");
@ -408,6 +618,14 @@ void cutlass_fp4_group_mm(
output, a, b, a_blockscale, b_blockscales, alphas, problem_sizes, output, a, b, a_blockscale, b_blockscales, alphas, problem_sizes,
expert_offsets, sf_offsets, M, N, K); expert_offsets, sf_offsets, M, N, K);
} else { } else {
#if defined ENABLE_NVFP4_SM120 && ENABLE_NVFP4_SM120
int32_t version_num = get_sm_version_num();
if (version_num >= 120 && version_num < 130) {
TORCH_CHECK_NOT_IMPLEMENTED(
false, "SM120 NVFP4 MOE only supports bfloat16 output, got: ",
output.scalar_type());
}
#endif
run_fp4_blockwise_scaled_group_mm<cutlass::half_t>( run_fp4_blockwise_scaled_group_mm<cutlass::half_t>(
output, a, b, a_blockscale, b_blockscales, alphas, problem_sizes, output, a, b, a_blockscale, b_blockscales, alphas, problem_sizes,
expert_offsets, sf_offsets, M, N, K); expert_offsets, sf_offsets, M, N, K);
@ -416,8 +634,8 @@ void cutlass_fp4_group_mm(
TORCH_CHECK_NOT_IMPLEMENTED( TORCH_CHECK_NOT_IMPLEMENTED(
false, false,
"No compiled cutlass_fp4_group_mm kernel, vLLM must " "No compiled cutlass_fp4_group_mm kernel, vLLM must "
"be compiled with ENABLE_NVFP4_SM100 for SM100+ and CUDA " "be compiled with ENABLE_NVFP4_SM100 or ENABLE_NVFP4_SM120 for SM100/120 "
"12.8 or above."); "and CUDA 12.8 or above.");
#endif #endif
} }

View File

@ -307,7 +307,7 @@ constexpr auto FLOAT = at::ScalarType::Float;
constexpr auto INT = at::ScalarType::Int; constexpr auto INT = at::ScalarType::Int;
constexpr auto UINT8 = at::ScalarType::Byte; constexpr auto UINT8 = at::ScalarType::Byte;
void scaled_fp4_experts_quant_sm100a( void scaled_fp4_experts_quant_sm1xxa(
torch::Tensor& output, torch::Tensor& output_scale, torch::Tensor& output, torch::Tensor& output_scale,
torch::Tensor const& input, torch::Tensor const& input_global_scale, torch::Tensor const& input, torch::Tensor const& input_global_scale,
torch::Tensor const& input_offset_by_experts, torch::Tensor const& input_offset_by_experts,

View File

@ -24,8 +24,9 @@ void scaled_fp4_quant_sm1xxa(torch::Tensor const& output,
torch::Tensor const& input_sf); torch::Tensor const& input_sf);
#endif #endif
#if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100 #if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
void scaled_fp4_experts_quant_sm100a( (defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
void scaled_fp4_experts_quant_sm1xxa(
torch::Tensor& output, torch::Tensor& output_scale, torch::Tensor& output, torch::Tensor& output_scale,
torch::Tensor const& input, torch::Tensor const& input_global_scale, torch::Tensor const& input, torch::Tensor const& input_global_scale,
torch::Tensor const& input_offset_by_experts, torch::Tensor const& input_offset_by_experts,
@ -54,8 +55,9 @@ void scaled_fp4_experts_quant(
torch::Tensor const& input, torch::Tensor const& input_global_scale, torch::Tensor const& input, torch::Tensor const& input_global_scale,
torch::Tensor const& input_offset_by_experts, torch::Tensor const& input_offset_by_experts,
torch::Tensor const& output_scale_offset_by_experts) { torch::Tensor const& output_scale_offset_by_experts) {
#if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100 #if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
return scaled_fp4_experts_quant_sm100a( (defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
return scaled_fp4_experts_quant_sm1xxa(
output, output_scale, input, input_global_scale, input_offset_by_experts, output, output_scale, input, input_global_scale, input_offset_by_experts,
output_scale_offset_by_experts); output_scale_offset_by_experts);
#endif #endif

View File

@ -15,6 +15,8 @@
*/ */
#include <torch/all.h> #include <torch/all.h>
#include <c10/cuda/CUDAGuard.h>
#include "cutlass_extensions/common.hpp"
#if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100 #if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100
void cutlass_scaled_fp4_mm_sm100a(torch::Tensor& D, torch::Tensor const& A, void cutlass_scaled_fp4_mm_sm100a(torch::Tensor& D, torch::Tensor const& A,
@ -32,23 +34,34 @@ void cutlass_scaled_fp4_mm_sm120a(torch::Tensor& D, torch::Tensor const& A,
torch::Tensor const& alpha); torch::Tensor const& alpha);
#endif #endif
void cutlass_scaled_fp4_mm(torch::Tensor& D, torch::Tensor const& A, void cutlass_scaled_fp4_mm(torch::Tensor& D, const torch::Tensor& A,
torch::Tensor const& B, torch::Tensor const& A_sf, const torch::Tensor& B, const torch::Tensor& A_sf,
torch::Tensor const& B_sf, const torch::Tensor& B_sf,
torch::Tensor const& alpha) { const torch::Tensor& alpha) {
#if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100 // Make sure were on As device.
return cutlass_scaled_fp4_mm_sm100a(D, A, B, A_sf, B_sf, alpha); const c10::cuda::OptionalCUDAGuard device_guard(device_of(A));
#elif defined ENABLE_NVFP4_SM120 && ENABLE_NVFP4_SM120 const int32_t sm = get_sm_version_num();
return cutlass_scaled_fp4_mm_sm120a(D, A, B, A_sf, B_sf, alpha);
#if defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100
if (sm >= 100 && sm < 120) {
cutlass_scaled_fp4_mm_sm100a(D, A, B, A_sf, B_sf, alpha);
return;
}
#endif #endif
TORCH_CHECK_NOT_IMPLEMENTED(false,
"No compiled nvfp4 mm kernel, vLLM should " #if defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120
"be compiled using CUDA 12.8 and target " if (sm >= 120 && sm < 130) {
"compute capability 100 or above."); cutlass_scaled_fp4_mm_sm120a(D, A, B, A_sf, B_sf, alpha);
return;
}
#endif
TORCH_CHECK_NOT_IMPLEMENTED(false, "No compiled nvfp4 mm kernel for SM ", sm,
". Recompile with CUDA >= 12.8 and CC >= 100.");
} }
bool cutlass_scaled_mm_supports_fp4(int64_t cuda_device_capability) { bool cutlass_scaled_mm_supports_fp4(int64_t cuda_device_capability) {
int runtimeVersion; int runtimeVersion;
cudaRuntimeGetVersion(&runtimeVersion); cudaRuntimeGetVersion(&runtimeVersion);
return cuda_device_capability >= 100 && runtimeVersion >= 12080; return cuda_device_capability >= 100 && runtimeVersion >= 12080;
} }

View File

@ -437,10 +437,10 @@ struct ComputeTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK {
for (int n_idx = 0; n_idx < WARP_NITER; ++n_idx) { for (int n_idx = 0; n_idx < WARP_NITER; ++n_idx) {
#pragma unroll #pragma unroll
for (int k_idx = 0; k_idx < 2; ++k_idx) { for (int k_idx = 0; k_idx < 2; ++k_idx) {
FType low16 = FType low16 = MarlinScalarType2<FType>::float2num(
ScalarType<FType>::float2num(C_frag[m_idx][n_idx][k_idx * 2]); C_frag[m_idx][n_idx][k_idx * 2]);
FType high16 = FType high16 = MarlinScalarType2<FType>::float2num(
ScalarType<FType>::float2num(C_frag[m_idx][n_idx][k_idx * 2 + 1]); C_frag[m_idx][n_idx][k_idx * 2 + 1]);
uint32_t tmp = (reinterpret_cast<uint32_t&>(low16) & 0xffff) | uint32_t tmp = (reinterpret_cast<uint32_t&>(low16) & 0xffff) |
(reinterpret_cast<uint32_t&>(high16) << 16); (reinterpret_cast<uint32_t&>(high16) << 16);
int sts_offset = int sts_offset =

View File

@ -8,7 +8,7 @@
#include <cuda_bf16.h> #include <cuda_bf16.h>
#include <iostream> #include <iostream>
#include "../gptq_marlin/marlin_dtypes.cuh" #include "../gptq_marlin/marlin_dtypes.cuh"
using marlin::ScalarType; using marlin::MarlinScalarType2;
namespace allspark { namespace allspark {
@ -72,10 +72,10 @@ __global__ void f16_gemm_splitk_reduce_kernel(const FType* C_split, FType* C,
int n_mat = N_MATRIX > 0 ? N_MATRIX : (int)n_matrix; int n_mat = N_MATRIX > 0 ? N_MATRIX : (int)n_matrix;
for (int i = 0; i < n_mat; ++i) { for (int i = 0; i < n_mat; ++i) {
sum += ScalarType<FType>::num2float(C_split[idx + i * matrix_size]); sum += MarlinScalarType2<FType>::num2float(C_split[idx + i * matrix_size]);
} }
C[idx] = ScalarType<FType>::float2num(sum); C[idx] = MarlinScalarType2<FType>::float2num(sum);
} }
template <typename FType> template <typename FType>

View File

@ -1 +1,2 @@
kernel_*.cu sm*_kernel_*.cu
kernel_selector.h

View File

@ -4,14 +4,16 @@
namespace marlin { namespace marlin {
template <int const num_threads, int const num_bits> template <int const num_threads, int const num_bits, bool is_a_8bit>
__global__ void awq_marlin_repack_kernel( __global__ void awq_marlin_repack_kernel(
uint32_t const* __restrict__ b_q_weight_ptr, uint32_t* __restrict__ out_ptr, uint32_t const* __restrict__ b_q_weight_ptr, uint32_t* __restrict__ out_ptr,
int size_k, int size_n) { int size_k, int size_n) {
constexpr int pack_factor = 32 / num_bits; constexpr int pack_factor = 32 / num_bits;
int k_tiles = size_k / tile_k_size; constexpr int target_tile_n_size = tile_n_size / (is_a_8bit ? 2 : 1);
int n_tiles = size_n / tile_n_size; constexpr int target_tile_k_size = tile_k_size * (is_a_8bit ? 2 : 1);
int k_tiles = size_k / target_tile_k_size;
int n_tiles = size_n / target_tile_n_size;
int block_k_tiles = div_ceil(k_tiles, gridDim.x); int block_k_tiles = div_ceil(k_tiles, gridDim.x);
auto start_k_tile = blockIdx.x * block_k_tiles; auto start_k_tile = blockIdx.x * block_k_tiles;
@ -33,10 +35,10 @@ __global__ void awq_marlin_repack_kernel(
extern __shared__ int4 sh[]; extern __shared__ int4 sh[];
constexpr int tile_n_ints = tile_n_size / pack_factor; constexpr int tile_n_ints = target_tile_n_size / pack_factor;
constexpr int stage_n_threads = tile_n_ints / 4; constexpr int stage_n_threads = tile_n_ints / 4;
constexpr int stage_k_threads = tile_k_size; constexpr int stage_k_threads = target_tile_k_size;
constexpr int stage_size = stage_k_threads * stage_n_threads; constexpr int stage_size = stage_k_threads * stage_n_threads;
auto fetch_to_shared = [&](int pipe, int k_tile_id, int n_tile_id) { auto fetch_to_shared = [&](int pipe, int k_tile_id, int n_tile_id) {
@ -45,7 +47,7 @@ __global__ void awq_marlin_repack_kernel(
return; return;
} }
int first_n = n_tile_id * tile_n_size; int first_n = n_tile_id * target_tile_n_size;
int first_n_packed = first_n / pack_factor; int first_n_packed = first_n / pack_factor;
int4* sh_ptr = sh + stage_size * pipe; int4* sh_ptr = sh + stage_size * pipe;
@ -54,7 +56,7 @@ __global__ void awq_marlin_repack_kernel(
auto k_id = threadIdx.x / stage_n_threads; auto k_id = threadIdx.x / stage_n_threads;
auto n_id = threadIdx.x % stage_n_threads; auto n_id = threadIdx.x % stage_n_threads;
int first_k = k_tile_id * tile_k_size; int first_k = k_tile_id * target_tile_k_size;
cp_async4(&sh_ptr[k_id * stage_n_threads + n_id], cp_async4(&sh_ptr[k_id * stage_n_threads + n_id],
reinterpret_cast<int4 const*>( reinterpret_cast<int4 const*>(
@ -78,11 +80,11 @@ __global__ void awq_marlin_repack_kernel(
} }
int tc_col = th_id / 4; int tc_col = th_id / 4;
int tc_row = (th_id % 4) * 2; int tc_row = (th_id % 4) * (is_a_8bit ? 4 : 2);
constexpr int tc_offsets[4] = {0, 1, 8, 9}; constexpr int tc_offsets[4] = {0, 1, 8, 9};
int cur_n = warp_id * 16 + tc_col; int cur_n = (warp_id / (is_a_8bit ? 2 : 1)) * 16 + tc_col;
int cur_n_packed = cur_n / pack_factor; int cur_n_packed = cur_n / pack_factor;
int cur_n_pos = cur_n % pack_factor; int cur_n_pos = cur_n % pack_factor;
@ -105,23 +107,50 @@ __global__ void awq_marlin_repack_kernel(
uint32_t vals[8]; uint32_t vals[8];
#pragma unroll #pragma unroll
for (int i = 0; i < 4; i++) { for (int i = 0; i < 4; i++) {
int cur_elem = tc_row + tc_offsets[i]; if constexpr (is_a_8bit) {
int cur_elem = tc_row + i;
int packed_src_0 = sh_stage_int_ptr[cur_n_packed + sh_stride * cur_elem]; int packed_src_0 =
int packed_src_1 = sh_stage_int_ptr[cur_n_packed + (8 / pack_factor) + sh_stage_int_ptr[cur_n_packed + (8 / pack_factor) * (warp_id % 2) +
sh_stride * cur_elem]; sh_stride * cur_elem];
int packed_src_1 =
sh_stage_int_ptr[cur_n_packed + (8 / pack_factor) * (warp_id % 2) +
sh_stride * (cur_elem + 16)];
vals[i] = (packed_src_0 >> (cur_n_pos_unpacked * num_bits)) & mask; vals[i] = (packed_src_0 >> (cur_n_pos_unpacked * num_bits)) & mask;
vals[4 + i] = (packed_src_1 >> (cur_n_pos_unpacked * num_bits)) & mask; vals[4 + i] = (packed_src_1 >> (cur_n_pos_unpacked * num_bits)) & mask;
} else {
int cur_elem = tc_row + tc_offsets[i];
int packed_src_0 =
sh_stage_int_ptr[cur_n_packed + sh_stride * cur_elem];
int packed_src_1 = sh_stage_int_ptr[cur_n_packed + (8 / pack_factor) +
sh_stride * cur_elem];
vals[i] = (packed_src_0 >> (cur_n_pos_unpacked * num_bits)) & mask;
vals[4 + i] = (packed_src_1 >> (cur_n_pos_unpacked * num_bits)) & mask;
}
} }
constexpr int tile_size = tile_k_size * tile_n_size / pack_factor; constexpr int tile_size =
target_tile_k_size * target_tile_n_size / pack_factor;
int out_offset = (k_tile_id * n_tiles + n_tile_id) * tile_size; int out_offset = (k_tile_id * n_tiles + n_tile_id) * tile_size;
// Result of: // Result of:
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h // https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h
if constexpr (num_bits == 4) { if constexpr (!is_a_8bit && num_bits == 4) {
constexpr int pack_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; int pack_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
uint32_t res = 0;
#pragma unroll
for (int i = 0; i < 8; i++) {
res |= vals[pack_idx[i]] << (i * 4);
}
out_ptr[out_offset + th_id * 4 + warp_id] = res;
} else if constexpr (is_a_8bit && num_bits == 4) {
int pack_idx[8] = {0, 4, 1, 5, 2, 6, 3, 7};
uint32_t res = 0; uint32_t res = 0;
#pragma unroll #pragma unroll
@ -138,8 +167,9 @@ __global__ void awq_marlin_repack_kernel(
uint32_t res2 = 0; uint32_t res2 = 0;
#pragma unroll #pragma unroll
for (int i = 0; i < 4; i++) { for (int i = 0; i < 4; i++) {
res1 |= vals[pack_idx[i]] << (i * 8); const int ii = is_a_8bit ? i : pack_idx[i];
res2 |= vals[4 + pack_idx[i]] << (i * 8); res1 |= vals[ii] << (i * 8);
res2 |= vals[4 + ii] << (i * 8);
} }
out_ptr[out_offset + th_id * 8 + (warp_id * 2) + 0] = res1; out_ptr[out_offset + th_id * 8 + (warp_id * 2) + 0] = res1;
@ -176,18 +206,21 @@ __global__ void awq_marlin_repack_kernel(
} // namespace marlin } // namespace marlin
#define CALL_IF(NUM_BITS) \ #define CALL_IF(NUM_BITS, IS_A_8BIT) \
else if (num_bits == NUM_BITS) { \ else if (num_bits == NUM_BITS && is_a_8bit == IS_A_8BIT) { \
cudaFuncSetAttribute( \ cudaFuncSetAttribute( \
marlin::awq_marlin_repack_kernel<marlin::repack_threads, NUM_BITS>, \ marlin::awq_marlin_repack_kernel<marlin::repack_threads, NUM_BITS, \
cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem); \ IS_A_8BIT>, \
marlin::awq_marlin_repack_kernel<marlin::repack_threads, NUM_BITS> \ cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem); \
<<<blocks, marlin::repack_threads, max_shared_mem, stream>>>( \ marlin::awq_marlin_repack_kernel<marlin::repack_threads, NUM_BITS, \
b_q_weight_ptr, out_ptr, size_k, size_n); \ IS_A_8BIT> \
<<<blocks, marlin::repack_threads, max_shared_mem, stream>>>( \
b_q_weight_ptr, out_ptr, size_k, size_n); \
} }
torch::Tensor awq_marlin_repack(torch::Tensor& b_q_weight, int64_t size_k, torch::Tensor awq_marlin_repack(torch::Tensor& b_q_weight, int64_t size_k,
int64_t size_n, int64_t num_bits) { int64_t size_n, int64_t num_bits,
bool is_a_8bit) {
// Verify compatibility with marlin tile of 16x64 // Verify compatibility with marlin tile of 16x64
TORCH_CHECK(size_k % marlin::tile_k_size == 0, "size_k = ", size_k, TORCH_CHECK(size_k % marlin::tile_k_size == 0, "size_k = ", size_k,
" is not divisible by tile_k_size = ", marlin::tile_k_size); " is not divisible by tile_k_size = ", marlin::tile_k_size);
@ -238,10 +271,13 @@ torch::Tensor awq_marlin_repack(torch::Tensor& b_q_weight, int64_t size_k,
if (false) { if (false) {
} }
CALL_IF(4) CALL_IF(4, false)
CALL_IF(8) CALL_IF(8, false)
CALL_IF(4, true)
CALL_IF(8, true)
else { else {
TORCH_CHECK(false, "Unsupported repack config: num_bits = ", num_bits); TORCH_CHECK(false, "Unsupported repack config: num_bits = ", num_bits,
", is_a_8bit = ", is_a_8bit);
} }
return out; return out;

View File

@ -470,6 +470,50 @@ __device__ inline void dequant<nv_bfloat162, vllm::kFE2M1f.id(), false>(
frag_b[0] = __hmul2(frag_b[0], bias_reg); frag_b[0] = __hmul2(frag_b[0], bias_reg);
} }
template <>
__device__ inline void dequant<__nv_fp8x4_e4m3, vllm::kFE2M1f.id(), true>(
int q, __nv_fp8x4_e4m3* frag_b) {
// Constants for FP4 (E2M1) and FP16 formats
constexpr int FP4_EXPONENT = 2, FP8_EXPONENT = 4;
constexpr int RIGHT_SHIFT = FP8_EXPONENT - FP4_EXPONENT;
constexpr int MASK = 0x70707070;
// Extract and shift FP4 values to FP16 format
int Out1 = (q & 0x80808080) | ((q & MASK) >> RIGHT_SHIFT);
q <<= 4;
int Out2 = (q & 0x80808080) | ((q & MASK) >> RIGHT_SHIFT);
// Note1: reverse indexing is intentional because weights are permuted
// Note2: when dequant to 8bit type, we write to `frag_b[2]` instead of
// `frag_b[1]` to fit the layout of tensorcore
frag_b[1] = *reinterpret_cast<const __nv_fp8x4_e4m3*>(&Out1);
frag_b[0] = *reinterpret_cast<const __nv_fp8x4_e4m3*>(&Out2);
}
template <>
__device__ inline void dequant<int32_t, vllm::kU4B8.id(), true>(
int q, int32_t* frag_b) {
constexpr int repeated_zp = 0x08080808;
constexpr int MASK = 0x80808080;
frag_b[0] = ((q & 0x0F0F0F0F | MASK) - repeated_zp) ^ MASK;
q >>= 4;
frag_b[1] = ((q & 0x0F0F0F0F | MASK) - repeated_zp) ^ MASK;
}
template <>
__device__ inline void dequant<__nv_fp8x4_e4m3, vllm::kU4B8.id(), true>(
int q, __nv_fp8x4_e4m3* frag_b) {
int s = q & 0x08080808;
int Out1 = ((q & 0x07070707) | (s << 4)) + (s >> 3);
q >>= 4;
s = q & 0x08080808;
int Out2 = ((q & 0x07070707) | (s << 4)) + (s >> 3);
frag_b[0] = *reinterpret_cast<const __nv_fp8x4_e4m3*>(&Out1);
frag_b[1] = *reinterpret_cast<const __nv_fp8x4_e4m3*>(&Out2);
}
template <typename scalar_t2, vllm::ScalarTypeId s_type_id> template <typename scalar_t2, vllm::ScalarTypeId s_type_id>
__device__ inline void dequant_fp8_scales(int q, scalar_t2* frag_b); __device__ inline void dequant_fp8_scales(int q, scalar_t2* frag_b);
@ -515,6 +559,49 @@ __device__ inline void dequant_fp8_scales<nv_bfloat162, vllm::kFE8M0fnu.id()>(
// Note: reverse indexing is intentional because weights are permuted // Note: reverse indexing is intentional because weights are permuted
frag_b[1] = *reinterpret_cast<const nv_bfloat162*>(&Out1); frag_b[1] = *reinterpret_cast<const nv_bfloat162*>(&Out1);
frag_b[0] = *reinterpret_cast<const nv_bfloat162*>(&Out2); frag_b[0] = *reinterpret_cast<const nv_bfloat162*>(&Out2);
};
// subtract zero point in quanted format and then dequant
template <typename scalar_t2, vllm::ScalarTypeId w_type_id,
bool skip_flop = false>
__device__ inline void sub_zp_and_dequant(int q, scalar_t2* frag_b, int zp);
template <>
__device__ inline void sub_zp_and_dequant<int32_t, vllm::kU4.id(), true>(
int q, int32_t* frag_b, int zp) {
// INT4 with zp -> INT8
// see https://github.com/vllm-project/vllm/pull/24722
int repeated_zp = 0x01010101 * zp;
int MASK = 0x80808080;
frag_b[0] = ((q & 0x0F0F0F0F | MASK) - repeated_zp) ^ MASK;
q >>= 4;
frag_b[1] = ((q & 0x0F0F0F0F | MASK) - repeated_zp) ^ MASK;
}
template <>
__device__ inline void sub_zp_and_dequant<__nv_fp8x4_e4m3, vllm::kU4.id(),
true>(int q, __nv_fp8x4_e4m3* frag_b,
int zp) {
// INT4 with zp -> FP8
// see https://github.com/vllm-project/vllm/pull/24722
uint32_t u_q = *reinterpret_cast<uint32_t*>(&q);
uint32_t u_zp = *reinterpret_cast<uint32_t*>(&zp);
uint32_t u_zp1 = u_zp + 1;
uint32_t repeated_zp = 0x01010101 * u_zp;
uint32_t q0, s;
q0 = (u_q & 0x0F0F0F0F) | 0x70707070;
s = (q0 + repeated_zp) & 0x80808080;
uint32_t Out1 = (q0 + (s >> 7) * u_zp1) & 0x0F0F0F0F | s;
u_q >>= 4;
q0 = (u_q & 0x0F0F0F0F) | 0x70707070;
s = (q0 + repeated_zp) & 0x80808080;
uint32_t Out2 = (q0 + (s >> 7) * u_zp1) & 0x0F0F0F0F | s;
frag_b[0] = *reinterpret_cast<const __nv_fp8x4_e4m3*>(&Out1);
frag_b[1] = *reinterpret_cast<const __nv_fp8x4_e4m3*>(&Out2);
} }
#endif #endif

View File

@ -4,141 +4,292 @@ import glob
import itertools import itertools
import os import os
import subprocess import subprocess
import sys
import jinja2 import jinja2
FILE_HEAD = """ ARCHS = []
// auto generated by generate.py SUPPORT_FP8 = False
// clang-format off for arch in sys.argv[1].split(","):
arch = arch[: arch.index(".") + 2].replace(".", "")
arch = int(arch)
# only SM89 and SM120 fully support
# mma.sync.aligned.m16n8k32.row.col.f32.e4m3.e4m3.f32.
# SM90 and SM100 can use this PTX, but its simulated
# with FP16 MMA, so it cannot achieve any acceleration.
if arch in [89, 120]:
SUPPORT_FP8 = True
FILE_HEAD_COMMENT = """
// auto generated by generate_kernels.py
// clang-format off
""".lstrip()
FILE_HEAD = (
FILE_HEAD_COMMENT
+ """
#include "kernel.h" #include "kernel.h"
#include "marlin_template.h" #include "marlin_template.h"
namespace MARLIN_NAMESPACE_NAME { namespace MARLIN_NAMESPACE_NAME {
""".strip() """
)
TEMPLATE = ( TEMPLATE = (
"template __global__ void Marlin<" "template __global__ void Marlin<"
"{{scalar_t}}, " "{{a_type_id}}, "
"{{w_type_id}}, " "{{b_type_id}}, "
"{{c_type_id}}, "
"{{s_type_id}}, " "{{s_type_id}}, "
"{{threads}}, " "{{threads}}, "
"{{thread_m_blocks}}, " "{{thread_m_blocks}}, "
"{{thread_n_blocks}}, " "{{thread_n_blocks}}, "
"{{thread_k_blocks}}, " "{{thread_k_blocks}}, "
"{{'true' if m_block_size_8 else 'false'}}, " "{{m_block_size_8}}, "
"{{stages}}, " "{{stages}}, "
"{{group_blocks}}, " "{{group_blocks}}, "
"{{'true' if is_zp_float else 'false'}}>" "{{is_zp_float}}>"
"( MARLIN_KERNEL_PARAMS );" "( MARLIN_KERNEL_PARAMS );"
) )
# int8 with zero point case (vllm::kU8) is also supported,
# we don't add it to reduce wheel size.
SCALAR_TYPES = [
"vllm::kU4",
"vllm::kU4B8",
"vllm::kU8B128",
"vllm::kFE4M3fn",
"vllm::kFE2M1f",
]
THREAD_CONFIGS = [(128, 128, 256), (64, 256, 256), (64, 128, 128), (128, 64, 128)] THREAD_CONFIGS = [(128, 128, 256), (64, 256, 256), (64, 128, 128), (128, 64, 128)]
THREAD_M_BLOCKS = [0.5, 1, 2, 3, 4] THREAD_M_BLOCKS = [0.5, 1, 2, 3, 4]
# group_blocks:
# = 0 : act order case QUANT_CONFIGS = [
# = -1 : channelwise quantization # AWQ-INT4
# > 0 : group_size=16*group_blocks {
GROUP_BLOCKS = [0, 1, -1, 2, 4, 8] "b_type": "kU4",
DTYPES = ["fp16", "bf16"] "thread_configs": THREAD_CONFIGS,
"thread_m_blocks": THREAD_M_BLOCKS,
"group_blocks": [-1, 2, 4, 8],
},
# HQQ
{
"a_type": ["kFloat16"],
"b_type": "kU4",
"thread_configs": THREAD_CONFIGS,
"thread_m_blocks": THREAD_M_BLOCKS,
"group_blocks": [4],
"is_zp_float": True,
},
# GPTQ-INT4
{
"b_type": "kU4B8",
"thread_configs": THREAD_CONFIGS,
"thread_m_blocks": THREAD_M_BLOCKS,
"group_blocks": [-1, 0, 2, 4, 8],
},
# GPTQ-INT8
{
"b_type": "kU8B128",
"thread_configs": THREAD_CONFIGS,
"thread_m_blocks": THREAD_M_BLOCKS,
"group_blocks": [-1, 0, 2, 4, 8],
},
# FP8
{
"b_type": "kFE4M3fn",
"thread_configs": THREAD_CONFIGS,
"thread_m_blocks": THREAD_M_BLOCKS,
"group_blocks": [-1, 8],
},
# NVFP4
{
"b_type": "kFE2M1f",
"s_type": "kFE4M3fn",
"thread_configs": THREAD_CONFIGS,
"thread_m_blocks": THREAD_M_BLOCKS,
"group_blocks": [1],
},
# MXFP4
{
"a_type": ["kBFloat16"],
"b_type": "kFE2M1f",
"s_type": "kFE8M0fnu",
"thread_configs": THREAD_CONFIGS,
"thread_m_blocks": THREAD_M_BLOCKS,
"group_blocks": [2],
},
# AWQ-INT4 with INT8 activation
{
"a_type": ["kS8"],
"b_type": "kU4",
"thread_configs": THREAD_CONFIGS,
"thread_m_blocks": [1, 2, 3, 4],
"group_blocks": [-1, 2, 4, 8],
},
# GPTQ-INT4 with INT8 activation
{
"a_type": ["kS8"],
"b_type": "kU4B8",
"thread_configs": THREAD_CONFIGS,
"thread_m_blocks": [1, 2, 3, 4],
"group_blocks": [-1, 2, 4, 8],
},
# GPTQ-INT4 with FP8 activation
{
"a_type": ["kFE4M3fn"],
"b_type": "kU4B8",
"thread_configs": THREAD_CONFIGS,
"thread_m_blocks": [1, 2, 3, 4],
"group_blocks": [-1, 2, 4, 8],
},
# AWQ-INT4 with FP8 activation
{
"a_type": ["kFE4M3fn"],
"b_type": "kU4",
"thread_configs": THREAD_CONFIGS,
"thread_m_blocks": [1, 2, 3, 4],
"group_blocks": [-1, 2, 4, 8],
},
# MXFP4 with FP8 activation
{
"a_type": ["kFE4M3fn"],
"b_type": "kFE2M1f",
"c_type": ["kBFloat16"],
"s_type": "kFE8M0fnu",
"thread_configs": THREAD_CONFIGS,
"thread_m_blocks": [1, 2, 3, 4],
"group_blocks": [2],
},
]
def remove_old_kernels(): def remove_old_kernels():
for filename in glob.glob(os.path.dirname(__file__) + "/kernel_*.cu"): for filename in glob.glob(os.path.dirname(__file__) + "/*kernel_*.cu"):
subprocess.call(["rm", "-f", filename]) subprocess.call(["rm", "-f", filename])
filename = os.path.dirname(__file__) + "/kernel_selector.h"
subprocess.call(["rm", "-f", filename])
def generate_new_kernels(): def generate_new_kernels():
for scalar_type, dtype in itertools.product(SCALAR_TYPES, DTYPES): result_dict = {}
for quant_config in QUANT_CONFIGS:
c_types = quant_config.get("c_type", ["kFloat16", "kBFloat16"])
a_types = quant_config.get("a_type", ["kFloat16", "kBFloat16"])
b_type = quant_config["b_type"]
is_zp_float = quant_config.get("is_zp_float", False)
all_group_blocks = quant_config["group_blocks"]
all_m_blocks = quant_config["thread_m_blocks"]
all_thread_configs = quant_config["thread_configs"]
for a_type, c_type in itertools.product(a_types, c_types):
if not SUPPORT_FP8 and a_type == "kFE4M3fn":
continue
if "16" in a_type and "16" in c_type and a_type != c_type:
continue
s_type = quant_config.get("s_type", c_type)
if (a_type, b_type, c_type) not in result_dict:
result_dict[(a_type, b_type, c_type)] = []
for group_blocks, m_blocks, thread_configs in itertools.product(
all_group_blocks, all_m_blocks, all_thread_configs
):
thread_k, thread_n, threads = thread_configs
if threads == 256:
# for small batch (m_blocks == 1),
# we only need (128, 128, 256)
# for large batch (m_blocks > 1),
# we only need (64, 256, 256)
if m_blocks <= 1 and (thread_k, thread_n) != (128, 128):
continue
if m_blocks > 1 and (thread_k, thread_n) != (64, 256):
continue
config = {
"threads": threads,
"s_type": s_type,
"thread_m_blocks": max(m_blocks, 1),
"thread_k_blocks": thread_k // 16,
"thread_n_blocks": thread_n // 16,
"m_block_size_8": "true" if m_blocks == 0.5 else "false",
"stages": "pipe_stages",
"group_blocks": group_blocks,
"is_zp_float": "true" if is_zp_float else "false",
}
result_dict[(a_type, b_type, c_type)].append(config)
kernel_selector_str = FILE_HEAD_COMMENT
for (a_type, b_type, c_type), config_list in result_dict.items():
all_template_str_list = [] all_template_str_list = []
for config in config_list:
s_type = config["s_type"]
template_str = jinja2.Template(TEMPLATE).render(
a_type_id=f"vllm::{a_type}.id()",
b_type_id=f"vllm::{b_type}.id()",
c_type_id=f"vllm::{c_type}.id()",
s_type_id=f"vllm::{s_type}.id()",
**config,
)
all_template_str_list.append(template_str)
for group_blocks, m_blocks, thread_configs in itertools.product( conditions = [
GROUP_BLOCKS, THREAD_M_BLOCKS, THREAD_CONFIGS f"a_type == vllm::{a_type}",
): f"b_type == vllm::{b_type}",
# act order case only support gptq-int4 and gptq-int8 f"c_type == vllm::{c_type}",
if group_blocks == 0 and scalar_type not in [ f"s_type == vllm::{s_type}",
"vllm::kU4B8", f"threads == {config['threads']}",
"vllm::kU8B128", f"thread_m_blocks == {config['thread_m_blocks']}",
]: f"thread_n_blocks == {config['thread_n_blocks']}",
continue f"thread_k_blocks == {config['thread_k_blocks']}",
if thread_configs[2] == 256: f"m_block_size_8 == {config['m_block_size_8']}",
# for small batch (m_blocks == 1), we only need (128, 128, 256) f"group_blocks == {config['group_blocks']}",
# for large batch (m_blocks > 1), we only need (64, 256, 256) f"is_zp_float == {config['is_zp_float']}",
if m_blocks <= 1 and thread_configs[0] != 128: ]
continue conditions = " && ".join(conditions)
if m_blocks > 1 and thread_configs[0] != 64:
continue
# we only support channelwise quantization and group_size == 128 if kernel_selector_str == FILE_HEAD_COMMENT:
# for fp8 kernel_selector_str += f"if ({conditions})\n kernel = "
if scalar_type == "vllm::kFE4M3fn" and group_blocks not in [-1, 8]: else:
continue kernel_selector_str += f"else if ({conditions})\n kernel = "
# nvfp4 only supports group_size == 16
# mxfp4 only supports group_size == 32
if scalar_type == "vllm::kFE2M1f" and group_blocks not in [1, 2]:
continue
# other quantization methods don't support group_size = 16
if scalar_type != "vllm::kFE2M1f" and group_blocks == 1:
continue
k_blocks = thread_configs[0] // 16 kernel_template2 = (
n_blocks = thread_configs[1] // 16 "Marlin<{{a_type_id}}, {{b_type_id}}, {{c_type_id}}, "
threads = thread_configs[2] "{{s_type_id}}, {{threads}}, {{thread_m_blocks}}, "
"{{thread_n_blocks}}, {{thread_k_blocks}}, "
"{{m_block_size_8}}, {{stages}}, {{group_blocks}}, "
"{{is_zp_float}}>;"
)
c_dtype = "half" if dtype == "fp16" else "nv_bfloat16" kernel_selector_str += (
jinja2.Template(kernel_template2).render(
is_zp_float_list = [False] a_type_id=f"vllm::{a_type}.id()",
if dtype == "fp16" and scalar_type == "vllm::kU4" and group_blocks == 4: b_type_id=f"vllm::{b_type}.id()",
# HQQ (is_zp_float = true) only supports c_type_id=f"vllm::{c_type}.id()",
# 4bit quantization and fp16 s_type_id=f"vllm::{s_type}.id()",
is_zp_float_list.append(True) **config,
if scalar_type == "vllm::kFE2M1f" and group_blocks == 1:
s_type = "vllm::kFE4M3fn"
elif scalar_type == "vllm::kFE2M1f" and group_blocks == 2:
s_type = "vllm::kFE8M0fnu"
if dtype == "fp16":
# we cannot safely dequantize e8m0 to fp16, so skip this
continue
elif dtype == "fp16":
s_type = "vllm::kFloat16"
elif dtype == "bf16":
s_type = "vllm::kBFloat16"
for is_zp_float in is_zp_float_list:
template_str = jinja2.Template(TEMPLATE).render(
scalar_t=c_dtype,
w_type_id=scalar_type + ".id()",
s_type_id=s_type + ".id()",
threads=threads,
thread_m_blocks=max(m_blocks, 1),
thread_n_blocks=n_blocks,
thread_k_blocks=k_blocks,
m_block_size_8=m_blocks == 0.5,
stages="pipe_stages",
group_blocks=group_blocks,
is_zp_float=is_zp_float,
) )
+ "\n"
all_template_str_list.append(template_str) )
file_content = FILE_HEAD + "\n\n" file_content = FILE_HEAD + "\n\n"
file_content += "\n\n".join(all_template_str_list) + "\n\n}\n" file_content += "\n\n".join(all_template_str_list) + "\n\n}\n"
filename = f"kernel_{dtype}_{scalar_type[6:].lower()}.cu" if a_type == "kFE4M3fn":
filename = f"sm89_kernel_{a_type[1:]}_{b_type[1:]}_{c_type[1:]}.cu"
else:
filename = f"sm80_kernel_{a_type[1:]}_{b_type[1:]}_{c_type[1:]}.cu"
filename = filename.lower()
with open(os.path.join(os.path.dirname(__file__), filename), "w") as f: with open(os.path.join(os.path.dirname(__file__), filename), "w") as f:
f.write(file_content) f.write(file_content)
if not SUPPORT_FP8 and kernel_selector_str != FILE_HEAD_COMMENT:
kernel_selector_str += (
"else if (a_type == vllm::kFE4M3fn)\n"
" TORCH_CHECK(false, "
'"marlin kernel with fp8 activation is not built.");'
)
with open(os.path.join(os.path.dirname(__file__), "kernel_selector.h"), "w") as f:
f.write(kernel_selector_str)
if __name__ == "__main__": if __name__ == "__main__":
remove_old_kernels() remove_old_kernels()

View File

@ -53,7 +53,7 @@ torch::Tensor gptq_marlin_gemm(
std::optional<torch::Tensor> const& b_zeros_or_none, std::optional<torch::Tensor> const& b_zeros_or_none,
std::optional<torch::Tensor> const& g_idx_or_none, std::optional<torch::Tensor> const& g_idx_or_none,
std::optional<torch::Tensor> const& perm_or_none, torch::Tensor& workspace, std::optional<torch::Tensor> const& perm_or_none, torch::Tensor& workspace,
vllm::ScalarTypeId const& b_q_type_id, int64_t size_m, int64_t size_n, vllm::ScalarTypeId const& b_type_id, int64_t size_m, int64_t size_n,
int64_t size_k, bool is_k_full, bool use_atomic_add, bool use_fp32_reduce, int64_t size_k, bool is_k_full, bool use_atomic_add, bool use_fp32_reduce,
bool is_zp_float) { bool is_zp_float) {
TORCH_CHECK_NOT_IMPLEMENTED(false, TORCH_CHECK_NOT_IMPLEMENTED(false,
@ -243,204 +243,29 @@ bool is_valid_config(thread_config_t const& th_config, int thread_m_blocks,
int cache_size = get_kernel_cache_size( int cache_size = get_kernel_cache_size(
th_config, thread_m_blocks, prob_m, prob_n, prob_k, num_bits, group_size, th_config, thread_m_blocks, prob_m, prob_n, prob_k, num_bits, group_size,
has_act_order, is_k_full, has_zp, is_zp_float); has_act_order, is_k_full, has_zp, is_zp_float);
return cache_size + 512 <= max_shared_mem; return cache_size <= max_shared_mem;
} }
#define _GET_IF(W_TYPE, THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS, \ MarlinFuncPtr get_marlin_kernel(
M_BLOCK_SIZE_8, GROUP_BLOCKS, NUM_THREADS, IS_ZP_FLOAT) \ const vllm::ScalarType a_type, const vllm::ScalarType b_type,
else if (q_type == W_TYPE && thread_m_blocks == THREAD_M_BLOCKS && \ const vllm::ScalarType c_type, const vllm::ScalarType s_type,
thread_n_blocks == THREAD_N_BLOCKS && \ int thread_m_blocks, int thread_n_blocks, int thread_k_blocks,
thread_k_blocks == THREAD_K_BLOCKS && \ bool m_block_size_8, bool has_act_order, bool has_zp, int group_blocks,
m_block_size_8 == M_BLOCK_SIZE_8 && \ int threads, bool is_zp_float) {
group_blocks == GROUP_BLOCKS && num_threads == NUM_THREADS && \ int num_bits = b_type.size_bits();
is_zp_float == IS_ZP_FLOAT) { \
constexpr auto S_TYPE = \
W_TYPE == vllm::kFE2M1f \
? (GROUP_BLOCKS == 1 ? vllm::kFE4M3fn : vllm::kFE8M0fnu) \
: (std::is_same<scalar_t, half>::value ? vllm::kFloat16 \
: vllm::kBFloat16); \
kernel = Marlin<scalar_t, W_TYPE.id(), S_TYPE.id(), NUM_THREADS, \
THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS, \
M_BLOCK_SIZE_8, pipe_stages, GROUP_BLOCKS, IS_ZP_FLOAT>; \
}
// COMMON: cases for (group_blocks in [-1, 2, 4, 8] and is_zp_float == false)
// this is the most common cases
// BIGGROUP: cases for big group size (group_blocks in [-1, 8])
// FZP: cases for float-zero-point (is_zp_float = true)
// ACT: cases for act order case (group_blocks == 0)
// FP4: cases for nvfp4(e2m1) (group_blocks == 1)
#define COMMON_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, -1, NUM_THREADS, false) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 2, NUM_THREADS, false) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 4, NUM_THREADS, false) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 8, NUM_THREADS, false) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS, false) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false)
#define COMMON_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
_GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \
_GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false) \
_GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS, false) \
_GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false) \
\
_GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \
_GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false) \
_GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS, false) \
_GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false) \
\
_GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \
_GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false) \
_GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS, false) \
_GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false)
#define COMMON_GET_IF(W_TYPE) \
COMMON_GET_IF_M1(W_TYPE, 8, 8, 256) \
COMMON_GET_IF_M1(W_TYPE, 8, 4, 128) \
COMMON_GET_IF_M1(W_TYPE, 4, 8, 128) \
COMMON_GET_IF_M234(W_TYPE, 16, 4, 256) \
COMMON_GET_IF_M234(W_TYPE, 8, 4, 128) \
COMMON_GET_IF_M234(W_TYPE, 4, 8, 128)
#define BIGGROUP_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, -1, NUM_THREADS, false) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 8, NUM_THREADS, false) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false)
#define BIGGROUP_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
_GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \
_GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false) \
_GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \
_GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false) \
_GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \
_GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false)
#define BIGGROUP_GET_IF(W_TYPE) \
BIGGROUP_GET_IF_M1(W_TYPE, 8, 8, 256) \
BIGGROUP_GET_IF_M1(W_TYPE, 8, 4, 128) \
BIGGROUP_GET_IF_M1(W_TYPE, 4, 8, 128) \
BIGGROUP_GET_IF_M234(W_TYPE, 16, 4, 256) \
BIGGROUP_GET_IF_M234(W_TYPE, 8, 4, 128) \
BIGGROUP_GET_IF_M234(W_TYPE, 4, 8, 128)
#define NVFP4_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 1, NUM_THREADS, false) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false)
#define NVFP4_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
_GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false) \
_GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false) \
_GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false)
#define NVFP4_GET_IF(W_TYPE) \
NVFP4_GET_IF_M1(W_TYPE, 8, 8, 256) \
NVFP4_GET_IF_M1(W_TYPE, 8, 4, 128) \
NVFP4_GET_IF_M1(W_TYPE, 4, 8, 128) \
NVFP4_GET_IF_M234(W_TYPE, 16, 4, 256) \
NVFP4_GET_IF_M234(W_TYPE, 8, 4, 128) \
NVFP4_GET_IF_M234(W_TYPE, 4, 8, 128)
#define MXFP4_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 2, NUM_THREADS, false) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false)
#define MXFP4_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
_GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false) \
_GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false) \
_GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false)
#define MXFP4_GET_IF(W_TYPE) \
MXFP4_GET_IF_M1(W_TYPE, 8, 8, 256) \
MXFP4_GET_IF_M1(W_TYPE, 8, 4, 128) \
MXFP4_GET_IF_M1(W_TYPE, 4, 8, 128) \
MXFP4_GET_IF_M234(W_TYPE, 16, 4, 256) \
MXFP4_GET_IF_M234(W_TYPE, 8, 4, 128) \
MXFP4_GET_IF_M234(W_TYPE, 4, 8, 128)
// We currently have 4-bit models only with group_blocks == 4
#define FZP_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 4, NUM_THREADS, true) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS, true)
#define FZP_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
_GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS, true) \
_GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS, true) \
_GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS, true)
#define FZP_GET_IF(W_TYPE) \
FZP_GET_IF_M1(W_TYPE, 8, 8, 256) \
FZP_GET_IF_M1(W_TYPE, 8, 4, 128) \
FZP_GET_IF_M1(W_TYPE, 4, 8, 128) \
FZP_GET_IF_M234(W_TYPE, 16, 4, 256) \
FZP_GET_IF_M234(W_TYPE, 8, 4, 128) \
FZP_GET_IF_M234(W_TYPE, 4, 8, 128)
// We currently have 4-bit models only with group_blocks == 4
#define ACT_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS, false) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 0, NUM_THREADS, false)
#define ACT_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
_GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 0, NUM_THREADS, false) \
_GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 0, NUM_THREADS, false) \
_GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 0, NUM_THREADS, false)
#define ACT_GET_IF(W_TYPE) \
ACT_GET_IF_M1(W_TYPE, 8, 8, 256) \
ACT_GET_IF_M1(W_TYPE, 8, 4, 128) \
ACT_GET_IF_M1(W_TYPE, 4, 8, 128) \
ACT_GET_IF_M234(W_TYPE, 16, 4, 256) \
ACT_GET_IF_M234(W_TYPE, 8, 4, 128) \
ACT_GET_IF_M234(W_TYPE, 4, 8, 128)
template <typename scalar_t>
MarlinFuncPtr get_marlin_kernel(const vllm::ScalarType q_type,
int thread_m_blocks, int thread_n_blocks,
int thread_k_blocks, bool m_block_size_8,
bool has_act_order, bool has_zp,
int group_blocks, int num_threads,
bool is_zp_float) {
int num_bits = q_type.size_bits();
auto kernel = MarlinDefault; auto kernel = MarlinDefault;
if (false) {
}
COMMON_GET_IF(vllm::kU4) #include "kernel_selector.h"
COMMON_GET_IF(vllm::kU4B8)
COMMON_GET_IF(vllm::kU8B128)
NVFP4_GET_IF(vllm::kFE2M1f)
BIGGROUP_GET_IF(vllm::kFE4M3fn)
ACT_GET_IF(vllm::kU4B8)
ACT_GET_IF(vllm::kU8B128)
if (std::is_same<scalar_t, half>::value) {
if (false) {
}
FZP_GET_IF(vllm::kU4)
}
if (std::is_same<scalar_t, nv_bfloat16>::value) {
if (false) {
}
MXFP4_GET_IF(vllm::kFE2M1f)
}
return kernel; return kernel;
} }
template <typename scalar_t> exec_config_t determine_exec_config(
exec_config_t determine_exec_config(const vllm::ScalarType& q_type, int prob_m, const vllm::ScalarType& a_type, const vllm::ScalarType& b_type,
int prob_n, int prob_k, int thread_m_blocks, const vllm::ScalarType& c_type, const vllm::ScalarType& s_type, int prob_m,
bool m_block_size_8, int num_bits, int prob_n, int prob_k, int thread_m_blocks, bool m_block_size_8,
int group_size, bool has_act_order, int num_bits, int group_size, bool has_act_order, bool is_k_full,
bool is_k_full, bool has_zp, bool has_zp, bool is_zp_float, int max_shared_mem, int sms) {
bool is_zp_float, int max_shared_mem,
int sms) {
exec_config_t exec_cfg = exec_config_t{1, thread_config_t{-1, -1, -1}}; exec_config_t exec_cfg = exec_config_t{1, thread_config_t{-1, -1, -1}};
thread_config_t* thread_configs = thread_m_blocks > 1 thread_config_t* thread_configs = thread_m_blocks > 1
? large_batch_thread_configs ? large_batch_thread_configs
@ -455,7 +280,7 @@ exec_config_t determine_exec_config(const vllm::ScalarType& q_type, int prob_m,
if (!is_valid_config(th_config, thread_m_blocks, prob_m, prob_n, prob_k, if (!is_valid_config(th_config, thread_m_blocks, prob_m, prob_n, prob_k,
num_bits, group_size, has_act_order, is_k_full, has_zp, num_bits, group_size, has_act_order, is_k_full, has_zp,
is_zp_float, max_shared_mem)) { is_zp_float, max_shared_mem - 512)) {
continue; continue;
} }
@ -468,10 +293,11 @@ exec_config_t determine_exec_config(const vllm::ScalarType& q_type, int prob_m,
group_blocks = group_size == -1 ? -1 : group_size / 16; group_blocks = group_size == -1 ? -1 : group_size / 16;
} }
auto kernel = get_marlin_kernel<scalar_t>( auto kernel =
q_type, thread_m_blocks, th_config.thread_n / 16, get_marlin_kernel(a_type, b_type, c_type, s_type, thread_m_blocks,
th_config.thread_k / 16, m_block_size_8, has_act_order, has_zp, th_config.thread_n / 16, th_config.thread_k / 16,
group_blocks, th_config.num_threads, is_zp_float); m_block_size_8, has_act_order, has_zp, group_blocks,
th_config.num_threads, is_zp_float);
if (kernel == MarlinDefault) continue; if (kernel == MarlinDefault) continue;
@ -485,28 +311,16 @@ exec_config_t determine_exec_config(const vllm::ScalarType& q_type, int prob_m,
return exec_cfg; return exec_cfg;
} }
template <typename scalar_t>
void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias, void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
void* s, void* s2, void* zp, void* g_idx, void* perm, void* a_s, void* b_s, void* g_s, void* zp, void* g_idx,
void* a_tmp, int prob_m, int prob_n, int prob_k, int lda, void* perm, void* a_tmp, int prob_m, int prob_n, int prob_k,
void* workspace, vllm::ScalarType const& q_type, bool has_bias, int lda, void* workspace, vllm::ScalarType const& a_type,
vllm::ScalarType const& b_type, vllm::ScalarType const& c_type,
vllm::ScalarType const& s_type, bool has_bias,
bool has_act_order, bool is_k_full, bool has_zp, int num_groups, bool has_act_order, bool is_k_full, bool has_zp, int num_groups,
int group_size, int dev, cudaStream_t stream, int thread_k_init, int group_size, int dev, cudaStream_t stream, int thread_k_init,
int thread_n_init, int sms, bool use_atomic_add, int thread_n_init, int sms, bool use_atomic_add,
bool use_fp32_reduce, bool is_zp_float) { bool use_fp32_reduce, bool is_zp_float) {
if (has_zp) {
TORCH_CHECK(
q_type == vllm::kU4 || q_type == vllm::kU8,
"q_type must be u4 or u8 when has_zp = True. Got = ", q_type.str());
} else {
TORCH_CHECK(
q_type == vllm::kU4B8 || q_type == vllm::kU8B128 ||
q_type == vllm::kFE4M3fn || q_type == vllm::kFE2M1f,
"q_type must be uint4b8, uint8b128, float8_e4m3fn or float4_e2m1f when "
"has_zp = False. Got = ",
q_type.str());
}
TORCH_CHECK(prob_m > 0 && prob_n > 0 && prob_k > 0, "Invalid MNK = [", prob_m, TORCH_CHECK(prob_m > 0 && prob_n > 0 && prob_k > 0, "Invalid MNK = [", prob_m,
", ", prob_n, ", ", prob_k, "]"); ", ", prob_n, ", ", prob_k, "]");
@ -531,19 +345,21 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
} }
} }
int num_bits = q_type.size_bits(); int num_bits = b_type.size_bits();
const int4* A_ptr = (const int4*)A; const int4* A_ptr = (const int4*)A;
const int4* B_ptr = (const int4*)B; const int4* B_ptr = (const int4*)B;
int4* C_ptr = (int4*)C; int4* C_ptr = (int4*)C;
int4* C_tmp_ptr = (int4*)C_tmp; int4* C_tmp_ptr = (int4*)C_tmp;
const int4* bias_ptr = (const int4*)b_bias; const int4* bias_ptr = (const int4*)b_bias;
const int4* s_ptr = (const int4*)s; const float* a_s_ptr = (const float*)a_s;
const uint16_t* s2_ptr = (const uint16_t*)s2; const int4* b_s_ptr = (const int4*)b_s;
const uint16_t* g_s_ptr = (const uint16_t*)g_s;
const int4* zp_ptr = (const int4*)zp; const int4* zp_ptr = (const int4*)zp;
const int* g_idx_ptr = (const int*)g_idx; const int* g_idx_ptr = (const int*)g_idx;
const int* perm_ptr = (const int*)perm; const int* perm_ptr = (const int*)perm;
int4* a_tmp_ptr = (int4*)a_tmp; int4* a_tmp_ptr = (int4*)a_tmp;
int* locks = (int*)workspace; int* locks = (int*)workspace;
if (has_act_order) { if (has_act_order) {
@ -568,6 +384,21 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
cudaDevAttrMaxSharedMemoryPerBlockOptin, dev); cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
TORCH_CHECK(max_shared_mem > 0); TORCH_CHECK(max_shared_mem > 0);
int major_capability, minor_capability;
cudaDeviceGetAttribute(&major_capability, cudaDevAttrComputeCapabilityMajor,
dev);
cudaDeviceGetAttribute(&minor_capability, cudaDevAttrComputeCapabilityMinor,
dev);
TORCH_CHECK(major_capability * 10 + minor_capability >= 80,
"marlin kernel only support Ampere or newer GPUs.");
if (a_type == vllm::kFE4M3fn) {
TORCH_CHECK(
major_capability * 10 + minor_capability == 89 ||
major_capability * 10 + minor_capability == 120,
"Marlin W4A8-FP8 only support SM89 or SM120 device (It is slower than "
"Marlin W4A16 on other devices).");
}
int max_par = 16; int max_par = 16;
if (prob_n <= 4096) max_par = 16 * 8; if (prob_n <= 4096) max_par = 16 * 8;
int max_shared_mem_new = max_shared_mem; int max_shared_mem_new = max_shared_mem;
@ -583,7 +414,7 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
int thread_n = thread_n_init; int thread_n = thread_n_init;
int thread_m_blocks = min(div_ceil(prob_m_split, 16), max_thread_m_blocks); int thread_m_blocks = min(div_ceil(prob_m_split, 16), max_thread_m_blocks);
int m_block_size_8 = prob_m_split <= 8; int m_block_size_8 = prob_m_split <= 8 && a_type.size_bits() == 16;
// Set thread config // Set thread config
exec_config_t exec_cfg; exec_config_t exec_cfg;
@ -597,11 +428,25 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
" is not divisible by thread_k = ", thread_k); " is not divisible by thread_k = ", thread_k);
} else { } else {
// Auto config // Auto config
exec_cfg = determine_exec_config<scalar_t>( exec_cfg = determine_exec_config(
q_type, prob_m_split, prob_n, prob_k, thread_m_blocks, m_block_size_8, a_type, b_type, c_type, s_type, prob_m_split, prob_n, prob_k,
num_bits, group_size, has_act_order, is_k_full, has_zp, is_zp_float, thread_m_blocks, m_block_size_8, num_bits, group_size, has_act_order,
max_shared_mem, sms); is_k_full, has_zp, is_zp_float, max_shared_mem, sms);
thread_tfg = exec_cfg.tb_cfg; thread_tfg = exec_cfg.tb_cfg;
if (thread_tfg.thread_n != -1) {
if (prob_n / thread_tfg.thread_n *
div_ceil(prob_m_split, thread_m_blocks * 16) * 4 <=
sms) {
if (is_valid_config({128, 64, 128}, thread_m_blocks, prob_m_split,
prob_n, prob_k, num_bits, group_size,
has_act_order, is_k_full, has_zp, is_zp_float,
max_shared_mem_new)) {
thread_tfg = {128, 64, 128};
exec_cfg = {1, thread_tfg};
}
}
}
if (thread_tfg.thread_k == -1 && max_thread_m_blocks > 1) { if (thread_tfg.thread_k == -1 && max_thread_m_blocks > 1) {
max_thread_m_blocks--; max_thread_m_blocks--;
continue; continue;
@ -632,10 +477,10 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
", has_zp = ", has_zp, ", is_zp_float = ", is_zp_float, ", has_zp = ", has_zp, ", is_zp_float = ", is_zp_float,
", max_shared_mem_new = ", max_shared_mem_new); ", max_shared_mem_new = ", max_shared_mem_new);
auto kernel = get_marlin_kernel<scalar_t>( auto kernel = get_marlin_kernel(
q_type, thread_m_blocks, thread_n_blocks, thread_k_blocks, a_type, b_type, c_type, s_type, thread_m_blocks, thread_n_blocks,
m_block_size_8, has_act_order, has_zp, group_blocks, num_threads, thread_k_blocks, m_block_size_8, has_act_order, has_zp, group_blocks,
is_zp_float); num_threads, is_zp_float);
if (kernel == MarlinDefault) { if (kernel == MarlinDefault) {
TORCH_CHECK(false, "Unsupported shapes: MNK = [", prob_m, ", ", prob_n, TORCH_CHECK(false, "Unsupported shapes: MNK = [", prob_m, ", ", prob_n,
@ -657,13 +502,15 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
// avoid ">>>" being formatted to "> > >" // avoid ">>>" being formatted to "> > >"
// clang-format off // clang-format off
kernel<<<blocks, num_threads, max_shared_mem_new, stream>>>( kernel<<<blocks, num_threads, max_shared_mem_new, stream>>>(
A_ptr, B_ptr, C_ptr, C_tmp_ptr, bias_ptr, s_ptr, s2_ptr, zp_ptr, A_ptr, B_ptr, C_ptr, C_tmp_ptr, bias_ptr, a_s_ptr, b_s_ptr, g_s_ptr, zp_ptr,
g_idx_ptr, num_groups, g_idx_ptr, num_groups,
prob_m_split, prob_n, prob_k, lda, locks, has_bias, part_use_atomic_add, prob_m_split, prob_n, prob_k, lda, locks, has_bias, part_use_atomic_add,
use_fp32_reduce, max_shared_mem_new); use_fp32_reduce, max_shared_mem_new);
// clang-format on // clang-format on
A_ptr += prob_m_split * (lda / 8); bool is_a_8bit = a_type.size_bits() == 8;
A_ptr += prob_m_split * (lda / (is_a_8bit ? 16 : 8));
a_s_ptr += prob_m_split;
C_ptr += prob_m_split * (prob_n / 8); C_ptr += prob_m_split * (prob_n / 8);
rest_m -= prob_m_split; rest_m -= prob_m_split;
} }
@ -675,15 +522,73 @@ torch::Tensor gptq_marlin_gemm(
torch::Tensor& a, std::optional<torch::Tensor> c_or_none, torch::Tensor& a, std::optional<torch::Tensor> c_or_none,
torch::Tensor& b_q_weight, torch::Tensor& b_q_weight,
std::optional<torch::Tensor> const& b_bias_or_none, torch::Tensor& b_scales, std::optional<torch::Tensor> const& b_bias_or_none, torch::Tensor& b_scales,
std::optional<torch::Tensor> const& a_scales_or_none,
std::optional<torch::Tensor> const& global_scale_or_none, std::optional<torch::Tensor> const& global_scale_or_none,
std::optional<torch::Tensor> const& b_zeros_or_none, std::optional<torch::Tensor> const& b_zeros_or_none,
std::optional<torch::Tensor> const& g_idx_or_none, std::optional<torch::Tensor> const& g_idx_or_none,
std::optional<torch::Tensor> const& perm_or_none, torch::Tensor& workspace, std::optional<torch::Tensor> const& perm_or_none, torch::Tensor& workspace,
vllm::ScalarTypeId const& b_q_type_id, int64_t size_m, int64_t size_n, vllm::ScalarTypeId const& b_type_id, int64_t size_m, int64_t size_n,
int64_t size_k, bool is_k_full, bool use_atomic_add, bool use_fp32_reduce, int64_t size_k, bool is_k_full, bool use_atomic_add, bool use_fp32_reduce,
bool is_zp_float) { bool is_zp_float) {
vllm::ScalarType const b_q_type = vllm::ScalarType::from_id(b_q_type_id); vllm::ScalarTypeId a_type_id, c_type_id, s_type_id;
int pack_factor = 32 / b_q_type.size_bits();
auto c_dtype = a.dtype();
if (a.scalar_type() == at::ScalarType::Half) {
a_type_id = vllm::kFloat16.id();
c_type_id = vllm::kFloat16.id();
} else if (a.scalar_type() == at::ScalarType::BFloat16) {
a_type_id = vllm::kBFloat16.id();
c_type_id = vllm::kBFloat16.id();
} else {
c_dtype = b_scales.dtype();
if (b_scales.scalar_type() == at::ScalarType::Half) {
c_type_id = vllm::kFloat16.id();
} else if (b_scales.scalar_type() == at::ScalarType::BFloat16) {
c_type_id = vllm::kBFloat16.id();
} else {
c_type_id = vllm::kBFloat16.id();
TORCH_CHECK(c_or_none.has_value(), "c must be passed for W4A8-FP4");
torch::Tensor c = c_or_none.value();
c_dtype = c.dtype();
if (c.scalar_type() == at::ScalarType::Half) {
c_type_id = vllm::kFloat16.id();
} else if (c.scalar_type() == at::ScalarType::BFloat16) {
c_type_id = vllm::kBFloat16.id();
} else {
TORCH_CHECK(false, "unsupported c dtype");
}
}
if (a.scalar_type() == at::ScalarType::Float8_e4m3fn) {
a_type_id = vllm::kFE4M3fn.id();
} else if (a.scalar_type() == at::ScalarType::Char) {
a_type_id = vllm::kS8.id();
} else {
TORCH_CHECK(false, "unsupported `a` scalar_type");
}
}
s_type_id = c_type_id;
if (b_type_id == vllm::kFE2M1f.id()) {
if (b_scales.scalar_type() == at::ScalarType::Float8_e4m3fn) {
s_type_id = vllm::kFE4M3fn.id();
} else if (b_scales.scalar_type() == at::ScalarType::Float8_e8m0fnu) {
s_type_id = vllm::kFE8M0fnu.id();
} else {
TORCH_CHECK(false,
"When b_type = float4_e2m1f, b_scale scalar type must be",
"float8_e4m3fn (for NVFP4) or float8_e8m0fnu (for MXFP4).");
}
}
vllm::ScalarType a_type = vllm::ScalarType::from_id(a_type_id);
vllm::ScalarType b_type = vllm::ScalarType::from_id(b_type_id);
vllm::ScalarType c_type = vllm::ScalarType::from_id(c_type_id);
vllm::ScalarType s_type = vllm::ScalarType::from_id(s_type_id);
int pack_factor = 32 / b_type.size_bits();
// Verify A // Verify A
TORCH_CHECK(a.size(0) == size_m, "Shape mismatch: a.size(0) = ", a.size(0), TORCH_CHECK(a.size(0) == size_m, "Shape mismatch: a.size(0) = ", a.size(0),
@ -721,6 +626,21 @@ torch::Tensor gptq_marlin_gemm(
TORCH_CHECK(b_scales.device().is_cuda(), "b_scales is not on GPU"); TORCH_CHECK(b_scales.device().is_cuda(), "b_scales is not on GPU");
TORCH_CHECK(b_scales.is_contiguous(), "b_scales is not contiguous"); TORCH_CHECK(b_scales.is_contiguous(), "b_scales is not contiguous");
torch::Tensor a_scales;
auto options = torch::TensorOptions().dtype(c_dtype).device(a.device());
auto options_fp32 =
torch::TensorOptions().dtype(at::kFloat).device(a.device());
if (a_scales_or_none.has_value()) {
a_scales = a_scales_or_none.value();
TORCH_CHECK(a_type.size_bits() == 8,
"a_scales can only be used for 8bit activation.");
} else {
a_scales = torch::empty({0}, options_fp32);
TORCH_CHECK(a_type.size_bits() != 8,
"the a_scales parameter must be passed for 8bit activation.");
}
// thread_k: `k` size of a thread_tile in `weights` (can usually be left as // thread_k: `k` size of a thread_tile in `weights` (can usually be left as
// auto -1) // auto -1)
int thread_k = -1; int thread_k = -1;
@ -733,7 +653,6 @@ torch::Tensor gptq_marlin_gemm(
// Alloc buffers // Alloc buffers
const at::cuda::OptionalCUDAGuard device_guard(device_of(a)); const at::cuda::OptionalCUDAGuard device_guard(device_of(a));
auto options = torch::TensorOptions().dtype(a.dtype()).device(a.device());
torch::Tensor c; torch::Tensor c;
if (c_or_none.has_value()) { if (c_or_none.has_value()) {
c = c_or_none.value(); c = c_or_none.value();
@ -750,8 +669,6 @@ torch::Tensor gptq_marlin_gemm(
// Alloc C tmp buffer that is going to be used for the global reduce // Alloc C tmp buffer that is going to be used for the global reduce
torch::Tensor c_tmp; torch::Tensor c_tmp;
auto options_fp32 =
torch::TensorOptions().dtype(at::kFloat).device(a.device());
if (use_fp32_reduce) { if (use_fp32_reduce) {
int max_m_block_size = (size_m + 16 - 1) / 16 * 16; int max_m_block_size = (size_m + 16 - 1) / 16 * 16;
max_m_block_size = min(max_m_block_size, 64); max_m_block_size = min(max_m_block_size, 64);
@ -821,11 +738,11 @@ torch::Tensor gptq_marlin_gemm(
torch::Tensor global_scale; torch::Tensor global_scale;
if (global_scale_or_none.has_value()) { if (global_scale_or_none.has_value()) {
global_scale = global_scale_or_none.value(); global_scale = global_scale_or_none.value();
TORCH_CHECK(b_q_type == vllm::kFE2M1f && group_size == 16, TORCH_CHECK(b_type == vllm::kFE2M1f && s_type == vllm::kFE4M3fn,
"global_scale can only be used for nvfp4 format."); "global_scale can only be used for nvfp4 format.");
} else { } else {
global_scale = torch::empty({0}, options); global_scale = torch::empty({0}, options);
TORCH_CHECK(!(b_q_type == vllm::kFE2M1f && group_size == 16), TORCH_CHECK(!(b_type == vllm::kFE2M1f && s_type == vllm::kFE4M3fn),
"the global_scale parameter must be passed for nvfp4 format."); "the global_scale parameter must be passed for nvfp4 format.");
} }
@ -852,15 +769,15 @@ torch::Tensor gptq_marlin_gemm(
bool has_zp = b_zeros.size(-1) > 0; bool has_zp = b_zeros.size(-1) > 0;
if (has_zp) { if (has_zp) {
TORCH_CHECK( TORCH_CHECK(
b_q_type == vllm::kU4 || b_q_type == vllm::kU8, b_type == vllm::kU4 || b_type == vllm::kU8,
"b_q_type must be u4 or u8 when has_zp = True. Got = ", b_q_type.str()); "b_type must be u4 or u8 when has_zp = True. Got = ", b_type.str());
} else { } else {
TORCH_CHECK(b_q_type == vllm::kU4B8 || b_q_type == vllm::kU8B128 || TORCH_CHECK(b_type == vllm::kU4B8 || b_type == vllm::kU8B128 ||
b_q_type == vllm::kFE4M3fn || b_q_type == vllm::kFE2M1f, b_type == vllm::kS4 || b_type == vllm::kS8 ||
"b_q_type must be uint4b8, uint8b128, float8_e4m3fn or " b_type == vllm::kFE4M3fn || b_type == vllm::kFE2M1f,
"float4_e2m1f when " "b_type must be uint4b8, uint8b128, int4, int8, "
"has_zp = False. Got = ", "float8_e4m3fn or float4_e2m1f when has_zp = False. Got = ",
b_q_type.str()); b_type.str());
} }
if (has_zp && is_zp_float) { if (has_zp && is_zp_float) {
@ -902,59 +819,27 @@ torch::Tensor gptq_marlin_gemm(
" is below min_workspace_size = ", min_workspace_size); " is below min_workspace_size = ", min_workspace_size);
int dev = a.get_device(); int dev = a.get_device();
if (a.scalar_type() == at::ScalarType::Half) {
void* scales_ptr;
if (b_q_type == vllm::kFE2M1f) {
if (group_size == 16)
scales_ptr = b_scales.data_ptr<at::Float8_e4m3fn>();
else if (group_size == 32)
scales_ptr = b_scales.data_ptr<at::Float8_e8m0fnu>();
else
TORCH_CHECK(false,
"float4_e2m1f only supports group_size == 16 (NVFP4) ",
"and group_size == 32 (MXFP4)");
} else {
scales_ptr = b_scales.data_ptr<at::Half>();
}
marlin::marlin_mm<half>( TORCH_CHECK(a_scales.scalar_type() == at::ScalarType::Float,
a.data_ptr<at::Half>(), b_q_weight.data_ptr(), c.data_ptr<at::Half>(), "scalar type of a_scales must be float");
c_tmp.data_ptr<float>(), b_bias.data_ptr<at::Half>(), scales_ptr, TORCH_CHECK(global_scale.scalar_type() == c.scalar_type(),
global_scale.data_ptr<at::Half>(), b_zeros.data_ptr(), g_idx.data_ptr(), "scalar type of global_scale must be the same with c");
perm.data_ptr(), a_tmp.data_ptr<at::Half>(), size_m, size_n, size_k, if (a_type.size_bits() == 16) {
a.stride(0), workspace.data_ptr(), b_q_type, has_bias, has_act_order, TORCH_CHECK(
is_k_full, has_zp, num_groups, group_size, dev, a.scalar_type() == c.scalar_type(),
at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms, "scalar type of a must be the same with c for 16 bit activation");
use_atomic_add, use_fp32_reduce, is_zp_float);
} else if (a.scalar_type() == at::ScalarType::BFloat16) {
void* scales_ptr;
if (b_q_type == vllm::kFE2M1f) {
if (group_size == 16)
scales_ptr = b_scales.data_ptr<at::Float8_e4m3fn>();
else if (group_size == 32)
scales_ptr = b_scales.data_ptr<at::Float8_e8m0fnu>();
else
TORCH_CHECK(false,
"float4_e2m1f only supports group_size == 16 (NVFP4) ",
"and group_size == 32 (MXFP4)");
} else {
scales_ptr = b_scales.data_ptr<at::BFloat16>();
}
marlin::marlin_mm<nv_bfloat16>(
a.data_ptr<at::BFloat16>(), b_q_weight.data_ptr(),
c.data_ptr<at::BFloat16>(), c_tmp.data_ptr<float>(),
b_bias.data_ptr<at::BFloat16>(), scales_ptr,
global_scale.data_ptr<at::BFloat16>(), b_zeros.data_ptr(),
g_idx.data_ptr(), perm.data_ptr(), a_tmp.data_ptr<at::BFloat16>(),
size_m, size_n, size_k, a.stride(0), workspace.data_ptr(), b_q_type,
has_bias, has_act_order, is_k_full, has_zp, num_groups, group_size, dev,
at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms,
use_atomic_add, use_fp32_reduce, is_zp_float);
} else {
TORCH_CHECK(false, "gpt_marlin_gemm only supports bfloat16 and float16");
} }
marlin::marlin_mm(
a.data_ptr(), b_q_weight.data_ptr(), c.data_ptr(), c_tmp.data_ptr(),
b_bias.data_ptr(), a_scales.data_ptr(), b_scales.data_ptr(),
global_scale.data_ptr(), b_zeros.data_ptr(), g_idx.data_ptr(),
perm.data_ptr(), a_tmp.data_ptr(), size_m, size_n, size_k, a.stride(0),
workspace.data_ptr(), a_type, b_type, c_type, s_type, has_bias,
has_act_order, is_k_full, has_zp, num_groups, group_size, dev,
at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms,
use_atomic_add, use_fp32_reduce, is_zp_float);
return c; return c;
} }

View File

@ -4,15 +4,18 @@
namespace marlin { namespace marlin {
template <int const num_threads, int const num_bits, bool const has_perm> template <int const num_threads, int const num_bits, bool const has_perm,
bool is_a_8bit>
__global__ void gptq_marlin_repack_kernel( __global__ void gptq_marlin_repack_kernel(
uint32_t const* __restrict__ b_q_weight_ptr, uint32_t const* __restrict__ b_q_weight_ptr,
uint32_t const* __restrict__ perm_ptr, uint32_t* __restrict__ out_ptr, uint32_t const* __restrict__ perm_ptr, uint32_t* __restrict__ out_ptr,
int size_k, int size_n) { int size_k, int size_n) {
constexpr int pack_factor = 32 / num_bits; constexpr int pack_factor = 32 / num_bits;
int k_tiles = size_k / tile_k_size; constexpr int target_tile_n_size = tile_n_size / (is_a_8bit ? 2 : 1);
int n_tiles = size_n / tile_n_size; constexpr int target_tile_k_size = tile_k_size * (is_a_8bit ? 2 : 1);
int k_tiles = size_k / target_tile_k_size;
int n_tiles = size_n / target_tile_n_size;
int block_k_tiles = div_ceil(k_tiles, gridDim.x); int block_k_tiles = div_ceil(k_tiles, gridDim.x);
auto start_k_tile = blockIdx.x * block_k_tiles; auto start_k_tile = blockIdx.x * block_k_tiles;
@ -34,7 +37,7 @@ __global__ void gptq_marlin_repack_kernel(
extern __shared__ int4 sh[]; extern __shared__ int4 sh[];
constexpr int perm_size = tile_k_size / 4; constexpr int perm_size = target_tile_k_size / 4;
int4* sh_perm_ptr = sh; int4* sh_perm_ptr = sh;
int4* sh_pipe_ptr = sh_perm_ptr; int4* sh_pipe_ptr = sh_perm_ptr;
@ -42,14 +45,14 @@ __global__ void gptq_marlin_repack_kernel(
sh_pipe_ptr += perm_size; sh_pipe_ptr += perm_size;
} }
constexpr int tile_ints = tile_k_size / pack_factor; constexpr int tile_ints = target_tile_k_size / pack_factor;
constexpr int stage_n_threads = tile_n_size / 4; constexpr int stage_n_threads = target_tile_n_size / 4;
constexpr int stage_k_threads = has_perm ? tile_k_size : tile_ints; constexpr int stage_k_threads = has_perm ? target_tile_k_size : tile_ints;
constexpr int stage_size = stage_k_threads * stage_n_threads; constexpr int stage_size = stage_k_threads * stage_n_threads;
auto load_perm_to_shared = [&](int k_tile_id) { auto load_perm_to_shared = [&](int k_tile_id) {
int first_k_int4 = (k_tile_id * tile_k_size) / 4; int first_k_int4 = (k_tile_id * target_tile_k_size) / 4;
int4 const* perm_int4_ptr = reinterpret_cast<int4 const*>(perm_ptr); int4 const* perm_int4_ptr = reinterpret_cast<int4 const*>(perm_ptr);
@ -65,7 +68,7 @@ __global__ void gptq_marlin_repack_kernel(
return; return;
} }
int first_n = n_tile_id * tile_n_size; int first_n = n_tile_id * target_tile_n_size;
int4* sh_ptr = sh_pipe_ptr + stage_size * pipe; int4* sh_ptr = sh_pipe_ptr + stage_size * pipe;
@ -91,7 +94,7 @@ __global__ void gptq_marlin_repack_kernel(
auto k_id = threadIdx.x / stage_n_threads; auto k_id = threadIdx.x / stage_n_threads;
auto n_id = threadIdx.x % stage_n_threads; auto n_id = threadIdx.x % stage_n_threads;
int first_k = k_tile_id * tile_k_size; int first_k = k_tile_id * target_tile_k_size;
int first_k_packed = first_k / pack_factor; int first_k_packed = first_k / pack_factor;
cp_async4(&sh_ptr[k_id * stage_n_threads + n_id], cp_async4(&sh_ptr[k_id * stage_n_threads + n_id],
@ -117,13 +120,13 @@ __global__ void gptq_marlin_repack_kernel(
} }
int tc_col = th_id / 4; int tc_col = th_id / 4;
int tc_row = (th_id % 4) * 2; int tc_row = (th_id % 4) * (is_a_8bit ? 4 : 2);
constexpr int tc_offsets[4] = {0, 1, 8, 9}; constexpr int tc_offsets[4] = {0, 1, 8, 9};
int cur_n = warp_id * 16 + tc_col; int cur_n = (warp_id / (is_a_8bit ? 2 : 1)) * 16 + tc_col;
constexpr int sh_stride = 64; constexpr int sh_stride = target_tile_n_size;
constexpr uint32_t mask = (1 << num_bits) - 1; constexpr uint32_t mask = (1 << num_bits) - 1;
int4* sh_stage_ptr = sh_pipe_ptr + stage_size * pipe; int4* sh_stage_ptr = sh_pipe_ptr + stage_size * pipe;
@ -134,6 +137,7 @@ __global__ void gptq_marlin_repack_kernel(
uint32_t vals[8]; uint32_t vals[8];
if constexpr (has_perm) { if constexpr (has_perm) {
static_assert(!is_a_8bit);
for (int i = 0; i < 4; i++) { for (int i = 0; i < 4; i++) {
int k_idx = tc_row + tc_offsets[i]; int k_idx = tc_row + tc_offsets[i];
@ -156,28 +160,49 @@ __global__ void gptq_marlin_repack_kernel(
#pragma unroll #pragma unroll
for (int i = 0; i < tile_ints; i++) { for (int i = 0; i < tile_ints; i++) {
b1_vals[i] = sh_stage_int_ptr[cur_n + sh_stride * i]; if constexpr (is_a_8bit) {
b2_vals[i] = sh_stage_int_ptr[cur_n + 8 + sh_stride * i]; b1_vals[i] =
sh_stage_int_ptr[cur_n + sh_stride * i + (warp_id % 2) * 8];
} else {
b1_vals[i] = sh_stage_int_ptr[cur_n + sh_stride * i];
b2_vals[i] = sh_stage_int_ptr[cur_n + 8 + sh_stride * i];
}
} }
#pragma unroll #pragma unroll
for (int i = 0; i < 4; i++) { for (int i = 0; i < 4; i++) {
int cur_elem = tc_row + tc_offsets[i]; int cur_elem = tc_row + (is_a_8bit ? i : tc_offsets[i]);
int cur_int = cur_elem / pack_factor; int cur_int = cur_elem / pack_factor;
int cur_pos = cur_elem % pack_factor; int cur_pos = cur_elem % pack_factor;
vals[i] = (b1_vals[cur_int] >> (cur_pos * num_bits)) & mask; vals[i] = (b1_vals[cur_int] >> (cur_pos * num_bits)) & mask;
vals[4 + i] = (b2_vals[cur_int] >> (cur_pos * num_bits)) & mask; if constexpr (is_a_8bit)
vals[4 + i] =
(b1_vals[cur_int + tile_ints / 2] >> (cur_pos * num_bits)) & mask;
else
vals[4 + i] = (b2_vals[cur_int] >> (cur_pos * num_bits)) & mask;
} }
} }
constexpr int tile_size = tile_k_size * tile_n_size / pack_factor; constexpr int tile_size =
target_tile_k_size * target_tile_n_size / pack_factor;
int out_offset = (k_tile_id * n_tiles + n_tile_id) * tile_size; int out_offset = (k_tile_id * n_tiles + n_tile_id) * tile_size;
// Result of: // Result of:
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h // https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h
if constexpr (num_bits == 4) { if constexpr (!is_a_8bit && num_bits == 4) {
constexpr int pack_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; int pack_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
uint32_t res = 0;
#pragma unroll
for (int i = 0; i < 8; i++) {
res |= vals[pack_idx[i]] << (i * 4);
}
out_ptr[out_offset + th_id * 4 + warp_id] = res;
} else if constexpr (is_a_8bit && num_bits == 4) {
int pack_idx[8] = {0, 4, 1, 5, 2, 6, 3, 7};
uint32_t res = 0; uint32_t res = 0;
#pragma unroll #pragma unroll
@ -194,8 +219,9 @@ __global__ void gptq_marlin_repack_kernel(
uint32_t res2 = 0; uint32_t res2 = 0;
#pragma unroll #pragma unroll
for (int i = 0; i < 4; i++) { for (int i = 0; i < 4; i++) {
res1 |= vals[pack_idx[i]] << (i * 8); const int ii = is_a_8bit ? i : pack_idx[i];
res2 |= vals[4 + pack_idx[i]] << (i * 8); res1 |= vals[ii] << (i * 8);
res2 |= vals[4 + ii] << (i * 8);
} }
out_ptr[out_offset + th_id * 8 + (warp_id * 2) + 0] = res1; out_ptr[out_offset + th_id * 8 + (warp_id * 2) + 0] = res1;
@ -236,21 +262,22 @@ __global__ void gptq_marlin_repack_kernel(
} // namespace marlin } // namespace marlin
#define CALL_IF(NUM_BITS, HAS_PERM) \ #define CALL_IF(NUM_BITS, HAS_PERM, IS_A_8BIT) \
else if (num_bits == NUM_BITS && has_perm == HAS_PERM) { \ else if (num_bits == NUM_BITS && has_perm == HAS_PERM && \
is_a_8bit == IS_A_8BIT) { \
cudaFuncSetAttribute( \ cudaFuncSetAttribute( \
marlin::gptq_marlin_repack_kernel<marlin::repack_threads, NUM_BITS, \ marlin::gptq_marlin_repack_kernel<marlin::repack_threads, NUM_BITS, \
HAS_PERM>, \ HAS_PERM, IS_A_8BIT>, \
cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem); \ cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem); \
marlin::gptq_marlin_repack_kernel<marlin::repack_threads, NUM_BITS, \ marlin::gptq_marlin_repack_kernel<marlin::repack_threads, NUM_BITS, \
HAS_PERM> \ HAS_PERM, IS_A_8BIT> \
<<<blocks, marlin::repack_threads, max_shared_mem, stream>>>( \ <<<blocks, marlin::repack_threads, max_shared_mem, stream>>>( \
b_q_weight_ptr, perm_ptr, out_ptr, size_k, size_n); \ b_q_weight_ptr, perm_ptr, out_ptr, size_k, size_n); \
} }
torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm, torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
int64_t size_k, int64_t size_n, int64_t size_k, int64_t size_n,
int64_t num_bits) { int64_t num_bits, bool is_a_8bit) {
// Verify compatibility with marlin tile of 16x64 // Verify compatibility with marlin tile of 16x64
TORCH_CHECK(size_k % marlin::tile_k_size == 0, "size_k = ", size_k, TORCH_CHECK(size_k % marlin::tile_k_size == 0, "size_k = ", size_k,
" is not divisible by tile_k_size = ", marlin::tile_k_size); " is not divisible by tile_k_size = ", marlin::tile_k_size);
@ -309,13 +336,17 @@ torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
if (false) { if (false) {
} }
CALL_IF(4, false) CALL_IF(4, false, false)
CALL_IF(4, true) CALL_IF(4, true, false)
CALL_IF(8, false) CALL_IF(8, false, false)
CALL_IF(8, true) CALL_IF(8, true, false)
CALL_IF(4, false, true)
CALL_IF(8, false, true)
else { else {
TORCH_CHECK(false, "Unsupported repack config: num_bits = ", num_bits, TORCH_CHECK(false, "Unsupported repack config: num_bits = ", num_bits,
", has_perm = ", has_perm); ", has_perm = ", has_perm, ", is_a_8bit = ", is_a_8bit);
} }
return out; return out;

View File

@ -11,17 +11,19 @@
const int4 *__restrict__ A, const int4 *__restrict__ B, \ const int4 *__restrict__ A, const int4 *__restrict__ B, \
int4 *__restrict__ C, int4 *__restrict__ C_tmp, \ int4 *__restrict__ C, int4 *__restrict__ C_tmp, \
const int4 *__restrict__ b_bias_ptr, \ const int4 *__restrict__ b_bias_ptr, \
const float *__restrict__ a_scales_ptr, \
const int4 *__restrict__ scales_ptr, \ const int4 *__restrict__ scales_ptr, \
const uint16_t *__restrict__ scale2_ptr, \ const uint16_t *__restrict__ global_scale_ptr, \
const int4 *__restrict__ zp_ptr, const int *__restrict__ g_idx, \ const int4 *__restrict__ zp_ptr, const int *__restrict__ g_idx, \
int num_groups, int prob_m, int prob_n, int prob_k, int lda, int *locks, \ int num_groups, int prob_m, int prob_n, int prob_k, int lda, int *locks, \
bool has_bias, bool use_atomic_add, bool use_fp32_reduce, \ bool has_bias, bool use_atomic_add, bool use_fp32_reduce, \
int max_shared_mem int max_shared_mem
namespace MARLIN_NAMESPACE_NAME { namespace MARLIN_NAMESPACE_NAME {
template <typename scalar_t, // compute dtype, half or nv_float16 template <const vllm::ScalarTypeId a_type_id, // A ScalarType id
const vllm::ScalarTypeId w_type_id, // weight ScalarType id const vllm::ScalarTypeId b_type_id, // B ScalarType id
const vllm::ScalarTypeId s_type_id, // weight ScalarType id const vllm::ScalarTypeId c_type_id, // C ScalarType id
const vllm::ScalarTypeId s_type_id, // B_SCALE ScalarType id
const int threads, // number of threads in a threadblock const int threads, // number of threads in a threadblock
const int thread_m_blocks, // number of 16x16 blocks in the m const int thread_m_blocks, // number of 16x16 blocks in the m
// dimension (batchsize) of the // dimension (batchsize) of the

View File

@ -55,6 +55,45 @@ constexpr int div_ceil(int a, int b) { return (a + b - 1) / b; }
// No support for async // No support for async
#else #else
__device__ inline void cp_async1_ca_pred(void* smem_ptr, const void* glob_ptr,
bool pred = true) {
const int BYTES = 4;
uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
asm volatile(
"{\n"
" .reg .pred p;\n"
" setp.ne.b32 p, %0, 0;\n"
" @p cp.async.ca.shared.global [%1], [%2], %3;\n"
"}\n" ::"r"((int)pred),
"r"(smem), "l"(glob_ptr), "n"(BYTES));
}
__device__ inline void cp_async2_ca_pred(void* smem_ptr, const void* glob_ptr,
bool pred = true) {
const int BYTES = 8;
uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
asm volatile(
"{\n"
" .reg .pred p;\n"
" setp.ne.b32 p, %0, 0;\n"
" @p cp.async.ca.shared.global [%1], [%2], %3;\n"
"}\n" ::"r"((int)pred),
"r"(smem), "l"(glob_ptr), "n"(BYTES));
}
__device__ inline void cp_async4_ca_pred(void* smem_ptr, const void* glob_ptr,
bool pred = true) {
const int BYTES = 16;
uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
asm volatile(
"{\n"
" .reg .pred p;\n"
" setp.ne.b32 p, %0, 0;\n"
" @p cp.async.ca.shared.global [%1], [%2], %3;\n"
"}\n" ::"r"((int)pred),
"r"(smem), "l"(glob_ptr), "n"(BYTES));
}
__device__ inline void cp_async4_pred(void* smem_ptr, const void* glob_ptr, __device__ inline void cp_async4_pred(void* smem_ptr, const void* glob_ptr,
bool pred = true) { bool pred = true) {
const int BYTES = 16; const int BYTES = 16;

View File

@ -2,8 +2,10 @@
#ifndef _data_types_cuh #ifndef _data_types_cuh
#define _data_types_cuh #define _data_types_cuh
#include "marlin.cuh" #include "marlin.cuh"
#include "core/scalar_type.hpp"
#include <cuda_fp16.h> #include <cuda_fp16.h>
#include <cuda_bf16.h> #include <cuda_bf16.h>
#include <cuda_fp8.h>
#ifndef MARLIN_NAMESPACE_NAME #ifndef MARLIN_NAMESPACE_NAME
#define MARLIN_NAMESPACE_NAME marlin #define MARLIN_NAMESPACE_NAME marlin
@ -11,14 +13,16 @@
namespace MARLIN_NAMESPACE_NAME { namespace MARLIN_NAMESPACE_NAME {
template <typename scalar_t> template <long scalar_type_id>
class ScalarType {}; class MarlinScalarType {};
template <> template <>
class ScalarType<half> { class MarlinScalarType<vllm::kFloat16.id()> {
public: public:
using scalar_t = half; using scalar_t = half;
using scalar_t2 = half2; using scalar_t2 = half2;
using scalar_t4 = half2;
using scalar_32bit_t = half2;
// Matrix fragments for tensor core instructions; their precise layout is // Matrix fragments for tensor core instructions; their precise layout is
// documented here: // documented here:
@ -27,6 +31,7 @@ class ScalarType<half> {
using FragB = Vec<half2, 2>; using FragB = Vec<half2, 2>;
using FragC = Vec<float, 4>; using FragC = Vec<float, 4>;
using FragS = Vec<half2, 1>; using FragS = Vec<half2, 1>;
using FragS0 = Vec<__nv_fp8x2_e4m3, 1>;
using FragZP = Vec<half2, 4>; using FragZP = Vec<half2, 4>;
static __device__ float inline num2float(const half x) { static __device__ float inline num2float(const half x) {
@ -44,18 +49,25 @@ class ScalarType<half> {
static __host__ __device__ half inline float2num(const float x) { static __host__ __device__ half inline float2num(const float x) {
return __float2half(x); return __float2half(x);
} }
static __host__ __device__ float2 inline num22float2(const half2 x) {
return __half22float2(x);
}
}; };
template <> template <>
class ScalarType<nv_bfloat16> { class MarlinScalarType<vllm::kBFloat16.id()> {
public: public:
using scalar_t = nv_bfloat16; using scalar_t = nv_bfloat16;
using scalar_t2 = nv_bfloat162; using scalar_t2 = nv_bfloat162;
using scalar_t4 = nv_bfloat162;
using scalar_32bit_t = nv_bfloat162;
using FragA = Vec<nv_bfloat162, 4>; using FragA = Vec<nv_bfloat162, 4>;
using FragB = Vec<nv_bfloat162, 2>; using FragB = Vec<nv_bfloat162, 2>;
using FragC = Vec<float, 4>; using FragC = Vec<float, 4>;
using FragS = Vec<nv_bfloat162, 1>; using FragS = Vec<nv_bfloat162, 1>;
using FragS0 = Vec<__nv_fp8x2_e4m3, 1>;
using FragZP = Vec<nv_bfloat162, 4>; using FragZP = Vec<nv_bfloat162, 4>;
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800 #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800
@ -75,9 +87,63 @@ class ScalarType<nv_bfloat16> {
static __host__ __device__ nv_bfloat16 inline float2num(const float x) { static __host__ __device__ nv_bfloat16 inline float2num(const float x) {
return __float2bfloat16(x); return __float2bfloat16(x);
} }
static __host__ __device__ float2 inline num22float2(const nv_bfloat162 x) {
return __bfloat1622float2(x);
}
#endif #endif
}; };
template <>
class MarlinScalarType<vllm::kFE4M3fn.id()> {
public:
using scalar_t = __nv_fp8_e4m3;
using scalar_t2 = __nv_fp8x2_e4m3;
using scalar_t4 = __nv_fp8x4_e4m3;
using scalar_32bit_t = __nv_fp8x4_e4m3;
using FragA = Vec<__nv_fp8x4_e4m3, 4>;
using FragB = Vec<__nv_fp8x4_e4m3, 2>;
using FragC = Vec<float, 4>;
using FragZP = Vec<__nv_fp8x2_e4m3, 4>;
static __host__ __device__
float2 inline num22float2(const __nv_fp8x2_e4m3 x) {
return (float2)x;
}
};
template <>
class MarlinScalarType<vllm::kS8.id()> {
public:
using scalar_t = int8_t;
using scalar_t2 = int16_t;
using scalar_t4 = int32_t;
using scalar_32bit_t = int32_t;
using FragA = Vec<int32_t, 4>;
using FragB = Vec<int32_t, 2>;
using FragC = Vec<float, 4>;
using FragZP = Vec<int16_t, 4>;
};
template <typename scalar_t>
class MarlinScalarType2 {};
template <>
class MarlinScalarType2<half> : public MarlinScalarType<vllm::kFloat16.id()> {};
template <>
class MarlinScalarType2<nv_bfloat16>
: public MarlinScalarType<vllm::kBFloat16.id()> {};
template <>
class MarlinScalarType2<__nv_fp8_e4m3>
: public MarlinScalarType<vllm::kFE4M3fn.id()> {};
template <>
class MarlinScalarType2<int8_t> : public MarlinScalarType<vllm::kS8.id()> {};
} // namespace MARLIN_NAMESPACE_NAME } // namespace MARLIN_NAMESPACE_NAME
#endif #endif

View File

@ -0,0 +1,106 @@
#include "marlin.cuh"
#include "core/registration.h"
// for only non-zp format (like gptq)
__global__ void marlin_int4_fp8_preprocess_kernel_without_zp(
// qweight: (size_k * size_n // 8,)
const int32_t* __restrict__ qweight,
// output: same shape with qweight
int32_t* __restrict__ output) {
int32_t val = qweight[blockIdx.x * 32 + threadIdx.x];
int32_t new_val = 0;
#pragma unroll
for (int32_t i = 0; i < 8; i++) {
int32_t single_val = val & 0xF;
single_val = single_val >= 8 ? single_val - 8 : 15 - single_val;
new_val |= single_val << (i * 4);
val >>= 4;
}
output[blockIdx.x * 32 + threadIdx.x] = new_val;
}
// for awq format only (with zp and with awq weight layout)
__global__ void marlin_int4_fp8_preprocess_kernel_awq(
// AWQ qweight: (size_k, size_n // 8)
const int32_t* __restrict__ qweight,
// output: same shape with qweight
int32_t* __restrict__ output,
// AWQ zeros: (size_k // group_size, size_n // 8)
const int32_t* __restrict__ qzeros, int32_t size_n, int32_t size_k,
int32_t group_size) {
int32_t val =
qweight[(blockIdx.x * 32 + threadIdx.x) * size_n / 8 + blockIdx.y];
int32_t zero =
qzeros[(blockIdx.x * 32 + threadIdx.x) / group_size * size_n / 8 +
blockIdx.y];
int32_t new_val = 0;
#pragma unroll
for (int32_t i = 0; i < 8; i++) {
int32_t single_val = val & 0xF;
int32_t single_zero = zero & 0xF;
single_val =
single_val >= single_zero ? single_val - single_zero : 15 - single_val;
new_val |= single_val << (i * 4);
val >>= 4;
zero >>= 4;
}
output[(blockIdx.x * 32 + threadIdx.x) * size_n / 8 + blockIdx.y] = new_val;
}
torch::Tensor marlin_int4_fp8_preprocess(
torch::Tensor& qweight, std::optional<torch::Tensor> qzeros_or_none,
bool inplace) {
TORCH_CHECK(qweight.device().is_cuda(), "qweight is not on GPU");
TORCH_CHECK(qweight.scalar_type() == at::ScalarType::Int,
"qweight.dtype != torch.int32");
const at::cuda::OptionalCUDAGuard device_guard(device_of(qweight));
torch::Tensor output = inplace ? qweight : torch::empty_like(qweight);
if (!qzeros_or_none.has_value()) {
TORCH_CHECK(qweight.numel() * 8 % 256 == 0,
"qweight.numel() * 8 % 256 != 0");
int blocks = qweight.numel() * 8 / 256;
marlin_int4_fp8_preprocess_kernel_without_zp<<<blocks, 32>>>(
(const int32_t*)qweight.data_ptr(), (int32_t*)output.data_ptr());
} else {
int32_t size_k = qweight.size(0);
int32_t size_n = qweight.size(1) * 8;
torch::Tensor qzeros = qzeros_or_none.value();
TORCH_CHECK(size_k % 32 == 0, "size_k % 32 != 0");
TORCH_CHECK(qzeros.device().is_cuda(), "qzeros is not on GPU");
TORCH_CHECK(qzeros.scalar_type() == at::ScalarType::Int,
"qweight.dtype != torch.int32");
TORCH_CHECK(device_of(qweight) == device_of(qzeros),
"qzeros is not on the same device with qweight");
int32_t group_size = qweight.size(0) / qzeros.size(0);
TORCH_CHECK(qweight.size(1) == qzeros.size(1),
"qweight.size(1) != qzeros.size(1)");
TORCH_CHECK(qweight.size(0) % qzeros.size(0) == 0,
"qweight.size(0) % qzeros.size(0) != 0");
TORCH_CHECK(group_size % 8 == 0, "group_size % 8 != 0");
dim3 blocks(size_k / 32, size_n / 8);
marlin_int4_fp8_preprocess_kernel_awq<<<blocks, 32>>>(
(const int32_t*)qweight.data_ptr(), (int32_t*)output.data_ptr(),
(const int32_t*)qzeros.data_ptr(), size_n, size_k, group_size);
}
return output;
}
TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
m.impl("marlin_int4_fp8_preprocess", &marlin_int4_fp8_preprocess);
}

File diff suppressed because it is too large Load Diff

View File

@ -67,9 +67,9 @@ void cutlass_scaled_mm_sm100(torch::Tensor& c, torch::Tensor const& a,
std::optional<torch::Tensor> const& bias); std::optional<torch::Tensor> const& bias);
#endif #endif
#if defined(ENABLE_SCALED_MM_SM90) && ENABLE_SCALED_MM_SM90 || \ #if (defined(ENABLE_CUTLASS_MOE_SM90) && ENABLE_CUTLASS_MOE_SM90) || \
defined(ENABLE_SCALED_MM_SM100) && ENABLE_SCALED_MM_SM100 || \ (defined(ENABLE_CUTLASS_MOE_SM100) && ENABLE_CUTLASS_MOE_SM100) || \
defined(ENABLE_SCALED_MM_SM120) && ENABLE_SCALED_MM_SM120 (defined(ENABLE_CUTLASS_MOE_SM120) && ENABLE_CUTLASS_MOE_SM120)
void get_cutlass_moe_mm_data_caller( void get_cutlass_moe_mm_data_caller(
const torch::Tensor& topk_ids, torch::Tensor& expert_offsets, const torch::Tensor& topk_ids, torch::Tensor& expert_offsets,
torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2, torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
@ -284,8 +284,9 @@ void get_cutlass_moe_mm_data(
// This function currently gets compiled only if we have a valid cutlass moe // This function currently gets compiled only if we have a valid cutlass moe
// mm to run it for. // mm to run it for.
int32_t version_num = get_sm_version_num(); int32_t version_num = get_sm_version_num();
#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \ #if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \
(defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100) (defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100) || \
(defined ENABLE_CUTLASS_MOE_SM120 && ENABLE_CUTLASS_MOE_SM120)
get_cutlass_moe_mm_data_caller(topk_ids, expert_offsets, problem_sizes1, get_cutlass_moe_mm_data_caller(topk_ids, expert_offsets, problem_sizes1,
problem_sizes2, input_permutation, problem_sizes2, input_permutation,
output_permutation, num_experts, n, k, output_permutation, num_experts, n, k,
@ -296,7 +297,7 @@ void get_cutlass_moe_mm_data(
false, false,
"No compiled get_cutlass_moe_mm_data: no cutlass_scaled_mm kernel for " "No compiled get_cutlass_moe_mm_data: no cutlass_scaled_mm kernel for "
"CUDA device capability: ", "CUDA device capability: ",
version_num, ". Required capability: 90 or 100"); version_num, ". Required capability: 90, 100, or 120");
} }
void get_cutlass_moe_mm_problem_sizes( void get_cutlass_moe_mm_problem_sizes(
@ -304,8 +305,9 @@ void get_cutlass_moe_mm_problem_sizes(
torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n, torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets) { const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets) {
int32_t version_num = get_sm_version_num(); int32_t version_num = get_sm_version_num();
#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \ #if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \
(defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100) (defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100) || \
(defined ENABLE_CUTLASS_MOE_SM120 && ENABLE_CUTLASS_MOE_SM120)
get_cutlass_moe_mm_problem_sizes_caller(topk_ids, problem_sizes1, get_cutlass_moe_mm_problem_sizes_caller(topk_ids, problem_sizes1,
problem_sizes2, num_experts, n, k, problem_sizes2, num_experts, n, k,
blockscale_offsets); blockscale_offsets);
@ -315,7 +317,7 @@ void get_cutlass_moe_mm_problem_sizes(
false, false,
"No compiled get_cutlass_moe_mm_problem_sizes: no cutlass_scaled_mm " "No compiled get_cutlass_moe_mm_problem_sizes: no cutlass_scaled_mm "
"kernel for CUDA device capability: ", "kernel for CUDA device capability: ",
version_num, ". Required capability: 90 or 100"); version_num, ". Required capability: 90, 100, or 120");
} }
void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets, void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
@ -328,8 +330,9 @@ void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
// This function currently gets compiled only if we have a valid cutlass moe // This function currently gets compiled only if we have a valid cutlass moe
// mm to run it for. // mm to run it for.
int32_t version_num = get_sm_version_num(); int32_t version_num = get_sm_version_num();
#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \ #if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \
(defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100) (defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100) || \
(defined ENABLE_CUTLASS_MOE_SM120 && ENABLE_CUTLASS_MOE_SM120)
get_cutlass_pplx_moe_mm_data_caller(expert_offsets, problem_sizes1, get_cutlass_pplx_moe_mm_data_caller(expert_offsets, problem_sizes1,
problem_sizes2, expert_num_tokens, problem_sizes2, expert_num_tokens,
num_local_experts, padded_m, n, k); num_local_experts, padded_m, n, k);
@ -339,7 +342,7 @@ void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
false, false,
"No compiled get_cutlass_pplx_moe_mm_data: no cutlass_scaled_mm kernel " "No compiled get_cutlass_pplx_moe_mm_data: no cutlass_scaled_mm kernel "
"for CUDA device capability: ", "for CUDA device capability: ",
version_num, ". Required capability: 90 or 100"); version_num, ". Required capability: 90, 100, or 120");
} }
void cutlass_scaled_mm_azp(torch::Tensor& c, torch::Tensor const& a, void cutlass_scaled_mm_azp(torch::Tensor& c, torch::Tensor const& a,

View File

@ -63,7 +63,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
" int blocksparse_head_sliding_step) -> ()"); " int blocksparse_head_sliding_step) -> ()");
ops.impl("paged_attention_v2", torch::kCUDA, &paged_attention_v2); ops.impl("paged_attention_v2", torch::kCUDA, &paged_attention_v2);
#ifndef USE_ROCM
// Merge attn states // Merge attn states
// Implements section 2.2 of https://www.arxiv.org/pdf/2501.01005 // Implements section 2.2 of https://www.arxiv.org/pdf/2501.01005
// can be used to combine partial attention results (in the split-KV case) // can be used to combine partial attention results (in the split-KV case)
@ -76,7 +75,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
" Tensor suffix_output," " Tensor suffix_output,"
" Tensor suffix_lse) -> ()"); " Tensor suffix_lse) -> ()");
ops.impl("merge_attn_states", torch::kCUDA, &merge_attn_states); ops.impl("merge_attn_states", torch::kCUDA, &merge_attn_states);
#ifndef USE_ROCM
ops.def( ops.def(
"convert_vertical_slash_indexes(" "convert_vertical_slash_indexes("
" Tensor! block_count, Tensor! block_offset, " " Tensor! block_count, Tensor! block_offset, "
@ -299,9 +298,10 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// gptq_marlin Optimized Quantized GEMM for GPTQ. // gptq_marlin Optimized Quantized GEMM for GPTQ.
ops.def( ops.def(
"gptq_marlin_gemm(Tensor a, Tensor? c_or_none, Tensor b_q_weight, " "gptq_marlin_gemm(Tensor a, Tensor? c_or_none, Tensor b_q_weight, "
"Tensor? b_bias_or_none," "Tensor? b_bias_or_none,Tensor b_scales, "
"Tensor b_scales, Tensor? global_scale, Tensor? b_zeros_or_none, Tensor? " "Tensor? a_scales, Tensor? global_scale, Tensor? b_zeros_or_none, "
"g_idx_or_none, Tensor? perm_or_none, Tensor workspace, int b_q_type, " "Tensor? "
"g_idx_or_none, Tensor? perm_or_none, Tensor workspace, int b_type_id, "
"SymInt size_m, SymInt size_n, SymInt size_k, bool is_k_full, " "SymInt size_m, SymInt size_n, SymInt size_k, bool is_k_full, "
"bool use_atomic_add, bool use_fp32_reduce, bool is_zp_float) -> Tensor"); "bool use_atomic_add, bool use_fp32_reduce, bool is_zp_float) -> Tensor");
// conditionally compiled so impl registration is in source file // conditionally compiled so impl registration is in source file
@ -309,13 +309,19 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// gptq_marlin repack from GPTQ. // gptq_marlin repack from GPTQ.
ops.def( ops.def(
"gptq_marlin_repack(Tensor b_q_weight, Tensor perm, " "gptq_marlin_repack(Tensor b_q_weight, Tensor perm, "
"SymInt size_k, SymInt size_n, int num_bits) -> Tensor"); "SymInt size_k, SymInt size_n, int num_bits, bool is_a_8bit) -> Tensor");
// conditionally compiled so impl registrations are in source file // conditionally compiled so impl registrations are in source file
// awq_marlin repack from AWQ. // awq_marlin repack from AWQ.
ops.def( ops.def(
"awq_marlin_repack(Tensor b_q_weight, SymInt size_k, " "awq_marlin_repack(Tensor b_q_weight, SymInt size_k, "
"SymInt size_n, int num_bits) -> Tensor"); "SymInt size_n, int num_bits, bool is_a_8bit) -> Tensor");
// conditionally compiled so impl registrations are in source file
// preprocess W-int4A-fp8 weight for marlin kernel
ops.def(
"marlin_int4_fp8_preprocess(Tensor qweight, "
"Tensor? qzeros_or_none, bool inplace) -> Tensor");
// conditionally compiled so impl registrations are in source file // conditionally compiled so impl registrations are in source file
// CUTLASS w4a8 GEMM // CUTLASS w4a8 GEMM

View File

@ -244,9 +244,15 @@ RUN mkdir -p /tmp/deepgemm/dist && touch /tmp/deepgemm/dist/.deepgemm_skipped
COPY tools/ep_kernels/install_python_libraries.sh /tmp/install_python_libraries.sh COPY tools/ep_kernels/install_python_libraries.sh /tmp/install_python_libraries.sh
# Install EP kernels(pplx-kernels and DeepEP) # Install EP kernels(pplx-kernels and DeepEP)
ARG PPLX_COMMIT_HASH
ARG DEEPEP_COMMIT_HASH
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
export TORCH_CUDA_ARCH_LIST='9.0a 10.0a' && \ export TORCH_CUDA_ARCH_LIST='9.0a 10.0a' && \
/tmp/install_python_libraries.sh /tmp/ep_kernels_workspace wheel && \ /tmp/install_python_libraries.sh \
--workspace /tmp/ep_kernels_workspace \
--mode wheel \
${PPLX_COMMIT_HASH:+--pplx-ref "$PPLX_COMMIT_HASH"} \
${DEEPEP_COMMIT_HASH:+--deepep-ref "$DEEPEP_COMMIT_HASH"} && \
find /tmp/ep_kernels_workspace/nvshmem -name '*.a' -delete find /tmp/ep_kernels_workspace/nvshmem -name '*.a' -delete
# Check the size of the wheel if RUN_WHEEL_CHECK is true # Check the size of the wheel if RUN_WHEEL_CHECK is true
@ -358,7 +364,12 @@ RUN CUDA_VERSION_DASH=$(echo $CUDA_VERSION | cut -d. -f1,2 | tr '.' '-') && \
cuda-cudart-${CUDA_VERSION_DASH} \ cuda-cudart-${CUDA_VERSION_DASH} \
cuda-nvrtc-${CUDA_VERSION_DASH} \ cuda-nvrtc-${CUDA_VERSION_DASH} \
cuda-cuobjdump-${CUDA_VERSION_DASH} \ cuda-cuobjdump-${CUDA_VERSION_DASH} \
libcublas-${CUDA_VERSION_DASH} && \ # https://github.com/vllm-project/vllm/issues/29590
libcurand-dev-${CUDA_VERSION_DASH} \
libcublas-${CUDA_VERSION_DASH} \
# Fixes nccl_allocator requiring nccl.h at runtime
# https://github.com/vllm-project/vllm/blob/1336a1ea244fa8bfd7e72751cabbdb5b68a0c11a/vllm/distributed/device_communicators/pynccl_allocator.py#L22
libnccl-dev && \
rm -rf /var/lib/apt/lists/* rm -rf /var/lib/apt/lists/*
ARG PIP_INDEX_URL UV_INDEX_URL ARG PIP_INDEX_URL UV_INDEX_URL
@ -392,8 +403,8 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
# Install FlashInfer pre-compiled kernel cache and binaries # Install FlashInfer pre-compiled kernel cache and binaries
# https://docs.flashinfer.ai/installation.html # https://docs.flashinfer.ai/installation.html
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system flashinfer-cubin==0.5.2 \ uv pip install --system flashinfer-cubin==0.5.3 \
&& uv pip install --system flashinfer-jit-cache==0.5.2 \ && uv pip install --system flashinfer-jit-cache==0.5.3 \
--extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \ --extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
&& flashinfer show-config && flashinfer show-config

View File

@ -132,7 +132,7 @@ RUN --mount=type=bind,src=requirements/test.in,target=requirements/test.in \
esac; \ esac; \
}; \ }; \
remove_packages_not_supported_on_aarch64 && \ remove_packages_not_supported_on_aarch64 && \
sed -i 's/^torch==.*/torch==2.8.0/g' requirements/cpu-test.in && \ sed -i 's/^torch==.*/torch==2.9.1/g' requirements/cpu-test.in && \
sed -i 's/torchaudio.*/torchaudio/g' requirements/cpu-test.in && \ sed -i 's/torchaudio.*/torchaudio/g' requirements/cpu-test.in && \
sed -i 's/torchvision.*/torchvision/g' requirements/cpu-test.in && \ sed -i 's/torchvision.*/torchvision/g' requirements/cpu-test.in && \
uv pip compile requirements/cpu-test.in -o requirements/cpu-test.txt --index-strategy unsafe-best-match --torch-backend cpu uv pip compile requirements/cpu-test.in -o requirements/cpu-test.txt --index-strategy unsafe-best-match --torch-backend cpu

View File

@ -65,6 +65,7 @@ COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/tests /tests
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/examples /examples COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/examples /examples
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/docker/Dockerfile.rocm /docker/ COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/docker/Dockerfile.rocm /docker/
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/.buildkite /.buildkite COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/.buildkite /.buildkite
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/vllm/v1 /vllm_v1
# ----------------------- # -----------------------
# Test vLLM image # Test vLLM image
@ -88,10 +89,22 @@ COPY --from=build_vllm ${COMMON_WORKDIR}/vllm /vllm-workspace
# install development dependencies (for testing) # install development dependencies (for testing)
RUN cd /vllm-workspace \ RUN cd /vllm-workspace \
&& rm -rf vllm \
&& python3 -m pip install -e tests/vllm_test_utils \ && python3 -m pip install -e tests/vllm_test_utils \
&& python3 -m pip install pytest-shard && python3 -m pip install pytest-shard
# enable fast downloads from hf (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system hf_transfer
ENV HF_HUB_ENABLE_HF_TRANSFER=1
# Copy in the v1 package (for python-only install test group)
COPY --from=export_vllm /vllm_v1 /usr/local/lib/python${PYTHON_VERSION}/dist-packages/vllm/v1
# Source code is used in the `python_only_compile.sh` test
# We hide it inside `src/` so that this source code
# will not be imported by other tests
RUN mkdir src && mv vllm src/vllm
# ----------------------- # -----------------------
# Final vLLM image # Final vLLM image
FROM base AS final FROM base AS final

View File

@ -5,6 +5,8 @@ ARG PYTORCH_BRANCH="1c57644d"
ARG PYTORCH_VISION_BRANCH="v0.23.0" ARG PYTORCH_VISION_BRANCH="v0.23.0"
ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git" ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git"
ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git" ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
ARG PYTORCH_AUDIO_BRANCH="v2.9.0"
ARG PYTORCH_AUDIO_REPO="https://github.com/pytorch/audio.git"
ARG FA_BRANCH="0e60e394" ARG FA_BRANCH="0e60e394"
ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git" ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
ARG AITER_BRANCH="59bd8ff2" ARG AITER_BRANCH="59bd8ff2"
@ -23,6 +25,7 @@ ENV AITER_ROCM_ARCH=gfx942;gfx950
ENV HSA_NO_SCRATCH_RECLAIM=1 ENV HSA_NO_SCRATCH_RECLAIM=1
ARG PYTHON_VERSION=3.12 ARG PYTHON_VERSION=3.12
ENV PYTHON_VERSION=${PYTHON_VERSION}
RUN mkdir -p /app RUN mkdir -p /app
WORKDIR /app WORKDIR /app
@ -45,6 +48,7 @@ RUN apt-get update -y \
&& python3 --version && python3 -m pip --version && python3 --version && python3 -m pip --version
RUN pip install -U packaging 'cmake<4' ninja wheel 'setuptools<80' pybind11 Cython RUN pip install -U packaging 'cmake<4' ninja wheel 'setuptools<80' pybind11 Cython
RUN apt-get update && apt-get install -y libjpeg-dev libsox-dev libsox-fmt-all sox && rm -rf /var/lib/apt/lists/*
FROM base AS build_triton FROM base AS build_triton
ARG TRITON_BRANCH ARG TRITON_BRANCH
@ -66,11 +70,14 @@ RUN mkdir -p /app/install && cp /opt/rocm/share/amd_smi/dist/*.whl /app/install
FROM base AS build_pytorch FROM base AS build_pytorch
ARG PYTORCH_BRANCH ARG PYTORCH_BRANCH
ARG PYTORCH_VISION_BRANCH ARG PYTORCH_VISION_BRANCH
ARG PYTORCH_AUDIO_BRANCH
ARG PYTORCH_REPO ARG PYTORCH_REPO
ARG PYTORCH_VISION_REPO ARG PYTORCH_VISION_REPO
ARG PYTORCH_AUDIO_REPO
RUN git clone ${PYTORCH_REPO} pytorch RUN git clone ${PYTORCH_REPO} pytorch
RUN cd pytorch && git checkout ${PYTORCH_BRANCH} && \ RUN cd pytorch && git checkout ${PYTORCH_BRANCH} \
pip install -r requirements.txt && git submodule update --init --recursive \ && pip install -r requirements.txt && git submodule update --init --recursive \
&& python3 tools/amd_build/build_amd.py \ && python3 tools/amd_build/build_amd.py \
&& CMAKE_PREFIX_PATH=$(python3 -c 'import sys; print(sys.prefix)') python3 setup.py bdist_wheel --dist-dir=dist \ && CMAKE_PREFIX_PATH=$(python3 -c 'import sys; print(sys.prefix)') python3 setup.py bdist_wheel --dist-dir=dist \
&& pip install dist/*.whl && pip install dist/*.whl
@ -78,8 +85,15 @@ RUN git clone ${PYTORCH_VISION_REPO} vision
RUN cd vision && git checkout ${PYTORCH_VISION_BRANCH} \ RUN cd vision && git checkout ${PYTORCH_VISION_BRANCH} \
&& python3 setup.py bdist_wheel --dist-dir=dist \ && python3 setup.py bdist_wheel --dist-dir=dist \
&& pip install dist/*.whl && pip install dist/*.whl
RUN git clone ${PYTORCH_AUDIO_REPO} audio
RUN cd audio && git checkout ${PYTORCH_AUDIO_BRANCH} \
&& git submodule update --init --recursive \
&& pip install -r requirements.txt \
&& python3 setup.py bdist_wheel --dist-dir=dist \
&& pip install dist/*.whl
RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \ RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \
&& cp /app/vision/dist/*.whl /app/install && cp /app/vision/dist/*.whl /app/install \
&& cp /app/audio/dist/*.whl /app/install
FROM base AS build_fa FROM base AS build_fa
ARG FA_BRANCH ARG FA_BRANCH
@ -130,6 +144,8 @@ ARG PYTORCH_BRANCH
ARG PYTORCH_VISION_BRANCH ARG PYTORCH_VISION_BRANCH
ARG PYTORCH_REPO ARG PYTORCH_REPO
ARG PYTORCH_VISION_REPO ARG PYTORCH_VISION_REPO
ARG PYTORCH_AUDIO_BRANCH
ARG PYTORCH_AUDIO_REPO
ARG FA_BRANCH ARG FA_BRANCH
ARG FA_REPO ARG FA_REPO
ARG AITER_BRANCH ARG AITER_BRANCH
@ -141,7 +157,9 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
&& echo "PYTORCH_VISION_BRANCH: ${PYTORCH_VISION_BRANCH}" >> /app/versions.txt \ && echo "PYTORCH_VISION_BRANCH: ${PYTORCH_VISION_BRANCH}" >> /app/versions.txt \
&& echo "PYTORCH_REPO: ${PYTORCH_REPO}" >> /app/versions.txt \ && echo "PYTORCH_REPO: ${PYTORCH_REPO}" >> /app/versions.txt \
&& echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \ && echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \
&& echo "PYTORCH_AUDIO_BRANCH: ${PYTORCH_AUDIO_BRANCH}" >> /app/versions.txt \
&& echo "PYTORCH_AUDIO_REPO: ${PYTORCH_AUDIO_REPO}" >> /app/versions.txt \
&& echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \ && echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \
&& echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \ && echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \
&& echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \ && echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \
&& echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt && echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt

View File

@ -5,11 +5,7 @@ nav:
- Getting Started: - Getting Started:
- getting_started/quickstart.md - getting_started/quickstart.md
- getting_started/installation - getting_started/installation
- Examples: - Examples: examples
- examples/README.md
- Offline Inference: examples/offline_inference
- Online Serving: examples/online_serving
- Others: examples/others
- General: - General:
- usage/v1_guide.md - usage/v1_guide.md
- usage/* - usage/*
@ -52,6 +48,11 @@ nav:
- Plugins: - Plugins:
- design/*plugin*.md - design/*plugin*.md
- design/* - design/*
- Benchmarking:
- benchmarking/README.md
- benchmarking/cli.md
- benchmarking/sweeps.md
- benchmarking/dashboard.md
- API Reference: - API Reference:
- api/README.md - api/README.md
- api/vllm - api/vllm

View File

@ -0,0 +1,7 @@
# Benchmark Suites
vLLM provides comprehensive benchmarking tools for performance testing and evaluation:
- **[Benchmark CLI](./cli.md)**: `vllm bench` CLI tools and specialized benchmark scripts for interactive performance testing.
- **[Parameter Sweeps](./sweeps.md)**: Automate `vllm bench` runs for multiple configurations, useful for [optimization and tuning](../configuration/optimization.md).
- **[Performance Dashboard](./dashboard.md)**: Automated CI that publishes benchmarks on each commit.

View File

@ -1,22 +1,10 @@
--- # Benchmark CLI
toc_depth: 4
---
# Benchmark Suites This section guides you through running benchmark tests with the extensive datasets supported on vLLM.
vLLM provides comprehensive benchmarking tools for performance testing and evaluation: It's a living document, updated as new features and datasets become available.
- **[Benchmark CLI](#benchmark-cli)**: `vllm bench` CLI tools and specialized benchmark scripts for interactive performance testing ## Dataset Overview
- **[Parameter sweeps](#parameter-sweeps)**: Automate `vllm bench` runs for multiple configurations
- **[Performance benchmarks](#performance-benchmarks)**: Automated CI benchmarks for development
## Benchmark CLI
This section guides you through running benchmark tests with the extensive
datasets supported on vLLM. It's a living document, updated as new features and datasets
become available.
### Dataset Overview
<style> <style>
th { th {
@ -59,9 +47,9 @@ Legend:
--dataset-path /datasets/VisionArena-Chat/ --hf-name lmarena-ai/VisionArena-Chat --dataset-path /datasets/VisionArena-Chat/ --hf-name lmarena-ai/VisionArena-Chat
``` ```
### Examples ## Examples
#### 🚀 Online Benchmark ### 🚀 Online Benchmark
<details class="admonition abstract" markdown="1"> <details class="admonition abstract" markdown="1">
<summary>Show more</summary> <summary>Show more</summary>
@ -112,7 +100,7 @@ P99 ITL (ms): 8.39
================================================== ==================================================
``` ```
##### Custom Dataset #### Custom Dataset
If the dataset you want to benchmark is not supported yet in vLLM, even then you can benchmark on it using `CustomDataset`. Your data needs to be in `.jsonl` format and needs to have "prompt" field per entry, e.g., data.jsonl If the dataset you want to benchmark is not supported yet in vLLM, even then you can benchmark on it using `CustomDataset`. Your data needs to be in `.jsonl` format and needs to have "prompt" field per entry, e.g., data.jsonl
@ -145,7 +133,7 @@ vllm bench serve --port 9001 --save-result --save-detailed \
You can skip applying chat template if your data already has it by using `--custom-skip-chat-template`. You can skip applying chat template if your data already has it by using `--custom-skip-chat-template`.
##### VisionArena Benchmark for Vision Language Models #### VisionArena Benchmark for Vision Language Models
```bash ```bash
# need a model with vision capability here # need a model with vision capability here
@ -163,7 +151,7 @@ vllm bench serve \
--num-prompts 1000 --num-prompts 1000
``` ```
##### InstructCoder Benchmark with Speculative Decoding #### InstructCoder Benchmark with Speculative Decoding
``` bash ``` bash
vllm serve meta-llama/Meta-Llama-3-8B-Instruct \ vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
@ -180,7 +168,7 @@ vllm bench serve \
--num-prompts 2048 --num-prompts 2048
``` ```
##### Spec Bench Benchmark with Speculative Decoding #### Spec Bench Benchmark with Speculative Decoding
``` bash ``` bash
vllm serve meta-llama/Meta-Llama-3-8B-Instruct \ vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
@ -217,7 +205,7 @@ vllm bench serve \
--spec-bench-category "summarization" --spec-bench-category "summarization"
``` ```
##### Other HuggingFaceDataset Examples #### Other HuggingFaceDataset Examples
```bash ```bash
vllm serve Qwen/Qwen2-VL-7B-Instruct vllm serve Qwen/Qwen2-VL-7B-Instruct
@ -283,7 +271,7 @@ vllm bench serve \
--blazedit-max-distance 0.99 --blazedit-max-distance 0.99
``` ```
##### Running With Sampling Parameters #### Running With Sampling Parameters
When using OpenAI-compatible backends such as `vllm`, optional sampling When using OpenAI-compatible backends such as `vllm`, optional sampling
parameters can be specified. Example client command: parameters can be specified. Example client command:
@ -301,7 +289,7 @@ vllm bench serve \
--num-prompts 10 --num-prompts 10
``` ```
##### Running With Ramp-Up Request Rate #### Running With Ramp-Up Request Rate
The benchmark tool also supports ramping up the request rate over the The benchmark tool also supports ramping up the request rate over the
duration of the benchmark run. This can be useful for stress testing the duration of the benchmark run. This can be useful for stress testing the
@ -318,11 +306,11 @@ The following arguments can be used to control the ramp-up:
- `--ramp-up-start-rps`: The request rate at the beginning of the benchmark. - `--ramp-up-start-rps`: The request rate at the beginning of the benchmark.
- `--ramp-up-end-rps`: The request rate at the end of the benchmark. - `--ramp-up-end-rps`: The request rate at the end of the benchmark.
##### Load Pattern Configuration #### Load Pattern Configuration
vLLM's benchmark serving script provides sophisticated load pattern simulation capabilities through three key parameters that control request generation and concurrency behavior: vLLM's benchmark serving script provides sophisticated load pattern simulation capabilities through three key parameters that control request generation and concurrency behavior:
###### Load Pattern Control Parameters ##### Load Pattern Control Parameters
- `--request-rate`: Controls the target request generation rate (requests per second). Set to `inf` for maximum throughput testing or finite values for controlled load simulation. - `--request-rate`: Controls the target request generation rate (requests per second). Set to `inf` for maximum throughput testing or finite values for controlled load simulation.
- `--burstiness`: Controls traffic variability using a Gamma distribution (range: > 0). Lower values create bursty traffic, higher values create uniform traffic. - `--burstiness`: Controls traffic variability using a Gamma distribution (range: > 0). Lower values create bursty traffic, higher values create uniform traffic.
@ -387,7 +375,7 @@ Using KV cache metrics for load pattern configuration:
</details> </details>
#### 📈 Offline Throughput Benchmark ### 📈 Offline Throughput Benchmark
<details class="admonition abstract" markdown="1"> <details class="admonition abstract" markdown="1">
<summary>Show more</summary> <summary>Show more</summary>
@ -408,7 +396,7 @@ Total num prompt tokens: 5014
Total num output tokens: 1500 Total num output tokens: 1500
``` ```
##### VisionArena Benchmark for Vision Language Models #### VisionArena Benchmark for Vision Language Models
```bash ```bash
vllm bench throughput \ vllm bench throughput \
@ -428,7 +416,7 @@ Total num prompt tokens: 14527
Total num output tokens: 1280 Total num output tokens: 1280
``` ```
##### InstructCoder Benchmark with Speculative Decoding #### InstructCoder Benchmark with Speculative Decoding
``` bash ``` bash
VLLM_WORKER_MULTIPROC_METHOD=spawn \ VLLM_WORKER_MULTIPROC_METHOD=spawn \
@ -451,7 +439,7 @@ Total num prompt tokens: 261136
Total num output tokens: 204800 Total num output tokens: 204800
``` ```
##### Other HuggingFaceDataset Examples #### Other HuggingFaceDataset Examples
`lmms-lab/LLaVA-OneVision-Data`: `lmms-lab/LLaVA-OneVision-Data`:
@ -509,20 +497,20 @@ vllm bench throughput \
</details> </details>
#### 🛠️ Structured Output Benchmark ### 🛠️ Structured Output Benchmark
<details class="admonition abstract" markdown="1"> <details class="admonition abstract" markdown="1">
<summary>Show more</summary> <summary>Show more</summary>
Benchmark the performance of structured output generation (JSON, grammar, regex). Benchmark the performance of structured output generation (JSON, grammar, regex).
##### Server Setup #### Server Setup
```bash ```bash
vllm serve NousResearch/Hermes-3-Llama-3.1-8B vllm serve NousResearch/Hermes-3-Llama-3.1-8B
``` ```
##### JSON Schema Benchmark #### JSON Schema Benchmark
```bash ```bash
python3 benchmarks/benchmark_serving_structured_output.py \ python3 benchmarks/benchmark_serving_structured_output.py \
@ -534,7 +522,7 @@ python3 benchmarks/benchmark_serving_structured_output.py \
--num-prompts 1000 --num-prompts 1000
``` ```
##### Grammar-based Generation Benchmark #### Grammar-based Generation Benchmark
```bash ```bash
python3 benchmarks/benchmark_serving_structured_output.py \ python3 benchmarks/benchmark_serving_structured_output.py \
@ -546,7 +534,7 @@ python3 benchmarks/benchmark_serving_structured_output.py \
--num-prompts 1000 --num-prompts 1000
``` ```
##### Regex-based Generation Benchmark #### Regex-based Generation Benchmark
```bash ```bash
python3 benchmarks/benchmark_serving_structured_output.py \ python3 benchmarks/benchmark_serving_structured_output.py \
@ -557,7 +545,7 @@ python3 benchmarks/benchmark_serving_structured_output.py \
--num-prompts 1000 --num-prompts 1000
``` ```
##### Choice-based Generation Benchmark #### Choice-based Generation Benchmark
```bash ```bash
python3 benchmarks/benchmark_serving_structured_output.py \ python3 benchmarks/benchmark_serving_structured_output.py \
@ -568,7 +556,7 @@ python3 benchmarks/benchmark_serving_structured_output.py \
--num-prompts 1000 --num-prompts 1000
``` ```
##### XGrammar Benchmark Dataset #### XGrammar Benchmark Dataset
```bash ```bash
python3 benchmarks/benchmark_serving_structured_output.py \ python3 benchmarks/benchmark_serving_structured_output.py \
@ -581,14 +569,14 @@ python3 benchmarks/benchmark_serving_structured_output.py \
</details> </details>
#### 📚 Long Document QA Benchmark ### 📚 Long Document QA Benchmark
<details class="admonition abstract" markdown="1"> <details class="admonition abstract" markdown="1">
<summary>Show more</summary> <summary>Show more</summary>
Benchmark the performance of long document question-answering with prefix caching. Benchmark the performance of long document question-answering with prefix caching.
##### Basic Long Document QA Test #### Basic Long Document QA Test
```bash ```bash
python3 benchmarks/benchmark_long_document_qa_throughput.py \ python3 benchmarks/benchmark_long_document_qa_throughput.py \
@ -600,7 +588,7 @@ python3 benchmarks/benchmark_long_document_qa_throughput.py \
--repeat-count 5 --repeat-count 5
``` ```
##### Different Repeat Modes #### Different Repeat Modes
```bash ```bash
# Random mode (default) - shuffle prompts randomly # Random mode (default) - shuffle prompts randomly
@ -633,14 +621,14 @@ python3 benchmarks/benchmark_long_document_qa_throughput.py \
</details> </details>
#### 🗂️ Prefix Caching Benchmark ### 🗂️ Prefix Caching Benchmark
<details class="admonition abstract" markdown="1"> <details class="admonition abstract" markdown="1">
<summary>Show more</summary> <summary>Show more</summary>
Benchmark the efficiency of automatic prefix caching. Benchmark the efficiency of automatic prefix caching.
##### Fixed Prompt with Prefix Caching #### Fixed Prompt with Prefix Caching
```bash ```bash
python3 benchmarks/benchmark_prefix_caching.py \ python3 benchmarks/benchmark_prefix_caching.py \
@ -651,7 +639,7 @@ python3 benchmarks/benchmark_prefix_caching.py \
--input-length-range 128:256 --input-length-range 128:256
``` ```
##### ShareGPT Dataset with Prefix Caching #### ShareGPT Dataset with Prefix Caching
```bash ```bash
# download dataset # download dataset
@ -682,14 +670,14 @@ vllm bench serve \
</details> </details>
#### ⚡ Request Prioritization Benchmark ### ⚡ Request Prioritization Benchmark
<details class="admonition abstract" markdown="1"> <details class="admonition abstract" markdown="1">
<summary>Show more</summary> <summary>Show more</summary>
Benchmark the performance of request prioritization in vLLM. Benchmark the performance of request prioritization in vLLM.
##### Basic Prioritization Test #### Basic Prioritization Test
```bash ```bash
python3 benchmarks/benchmark_prioritization.py \ python3 benchmarks/benchmark_prioritization.py \
@ -700,7 +688,7 @@ python3 benchmarks/benchmark_prioritization.py \
--scheduling-policy priority --scheduling-policy priority
``` ```
##### Multiple Sequences per Prompt #### Multiple Sequences per Prompt
```bash ```bash
python3 benchmarks/benchmark_prioritization.py \ python3 benchmarks/benchmark_prioritization.py \
@ -714,14 +702,14 @@ python3 benchmarks/benchmark_prioritization.py \
</details> </details>
#### 👁️ Multi-Modal Benchmark ### 👁️ Multi-Modal Benchmark
<details class="admonition abstract" markdown="1"> <details class="admonition abstract" markdown="1">
<summary>Show more</summary> <summary>Show more</summary>
Benchmark the performance of multi-modal requests in vLLM. Benchmark the performance of multi-modal requests in vLLM.
##### Images (ShareGPT4V) #### Images (ShareGPT4V)
Start vLLM: Start vLLM:
@ -747,7 +735,7 @@ vllm bench serve \
--endpoint /v1/chat/completions --endpoint /v1/chat/completions
``` ```
##### Videos (ShareGPT4Video) #### Videos (ShareGPT4Video)
Start vLLM: Start vLLM:
@ -773,7 +761,7 @@ vllm bench serve \
--endpoint /v1/chat/completions --endpoint /v1/chat/completions
``` ```
##### Synthetic Random Images (random-mm) #### Synthetic Random Images (random-mm)
Generate synthetic image inputs alongside random text prompts to stress-test vision models without external datasets. Generate synthetic image inputs alongside random text prompts to stress-test vision models without external datasets.
@ -846,14 +834,14 @@ This should be seen as an edge case, and if this behavior can be avoided by sett
</details> </details>
#### Embedding Benchmark ### Embedding Benchmark
Benchmark the performance of embedding requests in vLLM. Benchmark the performance of embedding requests in vLLM.
<details class="admonition abstract" markdown="1"> <details class="admonition abstract" markdown="1">
<summary>Show more</summary> <summary>Show more</summary>
##### Text Embeddings #### Text Embeddings
Unlike generative models which use Completions API or Chat Completions API, Unlike generative models which use Completions API or Chat Completions API,
you should set `--backend openai-embeddings` and `--endpoint /v1/embeddings` to use the Embeddings API. you should set `--backend openai-embeddings` and `--endpoint /v1/embeddings` to use the Embeddings API.
@ -879,7 +867,7 @@ vllm bench serve \
--dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json --dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json
``` ```
##### Multi-modal Embeddings #### Multi-modal Embeddings
Unlike generative models which use Completions API or Chat Completions API, Unlike generative models which use Completions API or Chat Completions API,
you should set `--endpoint /v1/embeddings` to use the Embeddings API. The backend to use depends on the model: you should set `--endpoint /v1/embeddings` to use the Embeddings API. The backend to use depends on the model:
@ -944,7 +932,7 @@ vllm bench serve \
</details> </details>
#### Reranker Benchmark ### Reranker Benchmark
Benchmark the performance of rerank requests in vLLM. Benchmark the performance of rerank requests in vLLM.
@ -988,222 +976,3 @@ to account for the extra prompt which is the query. The token accounting to repo
throughput numbers correctly is also adjusted. throughput numbers correctly is also adjusted.
</details> </details>
## Parameter Sweeps
### Online Benchmark
[`vllm/benchmarks/sweep/serve.py`](../../vllm/benchmarks/sweep/serve.py) automatically starts `vllm serve` and runs `vllm bench serve` to evaluate vLLM over multiple configurations.
Follow these steps to run the script:
1. Construct the base command to `vllm serve`, and pass it to the `--serve-cmd` option.
2. Construct the base command to `vllm bench serve`, and pass it to the `--bench-cmd` option.
3. (Optional) If you would like to vary the settings of `vllm serve`, create a new JSON file and populate it with the parameter combinations you want to test. Pass the file path to `--serve-params`.
- Example: Tuning `--max-num-seqs` and `--max-num-batched-tokens`:
```json
[
{
"max_num_seqs": 32,
"max_num_batched_tokens": 1024
},
{
"max_num_seqs": 64,
"max_num_batched_tokens": 1024
},
{
"max_num_seqs": 64,
"max_num_batched_tokens": 2048
},
{
"max_num_seqs": 128,
"max_num_batched_tokens": 2048
},
{
"max_num_seqs": 128,
"max_num_batched_tokens": 4096
},
{
"max_num_seqs": 256,
"max_num_batched_tokens": 4096
}
]
```
4. (Optional) If you would like to vary the settings of `vllm bench serve`, create a new JSON file and populate it with the parameter combinations you want to test. Pass the file path to `--bench-params`.
- Example: Using different input/output lengths for random dataset:
```json
[
{
"random_input_len": 128,
"random_output_len": 32
},
{
"random_input_len": 256,
"random_output_len": 64
},
{
"random_input_len": 512,
"random_output_len": 128
}
]
```
5. Determine where you want to save the results, and pass that to `--output-dir`.
Example command:
```bash
vllm bench sweep serve \
--serve-cmd 'vllm serve meta-llama/Llama-2-7b-chat-hf' \
--bench-cmd 'vllm bench serve --model meta-llama/Llama-2-7b-chat-hf --backend vllm --endpoint /v1/completions --dataset-name sharegpt --dataset-path benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json' \
--serve-params benchmarks/serve_hparams.json \
--bench-params benchmarks/bench_hparams.json \
-o benchmarks/results
```
!!! important
If both `--serve-params` and `--bench-params` are passed, the script will iterate over the Cartesian product between them.
You can use `--dry-run` to preview the commands to be run.
We only start the server once for each `--serve-params`, and keep it running for multiple `--bench-params`.
Between each benchmark run, we call the `/reset_prefix_cache` and `/reset_mm_cache` endpoints to get a clean slate for the next run.
In case you are using a custom `--serve-cmd`, you can override the commands used for resetting the state by setting `--after-bench-cmd`.
!!! note
By default, each parameter combination is run 3 times to make the results more reliable. You can adjust the number of runs by setting `--num-runs`.
!!! tip
You can use the `--resume` option to continue the parameter sweep if one of the runs failed.
### SLA Auto-Tuner
[`vllm/benchmarks/sweep/serve_sla.py`](../../vllm/benchmarks/sweep/serve_sla.py) is a wrapper over [`vllm/benchmarks/sweep/serve.py`](../../vllm/benchmarks/sweep/serve.py) that tunes either the request rate or concurrency (choose using `--sla-variable`) in order to satisfy the SLA constraints given by `--sla-params`.
For example, to ensure E2E latency within different target values for 99% of requests:
```json
[
{
"p99_e2el_ms": "<=200"
},
{
"p99_e2el_ms": "<=500"
},
{
"p99_e2el_ms": "<=1000"
},
{
"p99_e2el_ms": "<=2000"
}
]
```
Example command:
```bash
vllm bench sweep serve_sla \
--serve-cmd 'vllm serve meta-llama/Llama-2-7b-chat-hf' \
--bench-cmd 'vllm bench serve --model meta-llama/Llama-2-7b-chat-hf --backend vllm --endpoint /v1/completions --dataset-name sharegpt --dataset-path benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json' \
--serve-params benchmarks/serve_hparams.json \
--bench-params benchmarks/bench_hparams.json \
--sla-params benchmarks/sla_hparams.json \
--sla-variable max_concurrency \
-o benchmarks/results
```
The algorithm for adjusting the SLA variable is as follows:
1. Run the benchmark with infinite QPS, and use the corresponding metrics to determine the initial value of the variable.
- For example, the initial request rate is set to the concurrency under infinite QPS.
2. If the SLA is still satisfied, keep doubling the value until the SLA is no longer satisfied. This gives a relatively narrow window that contains the point where the SLA is barely satisfied.
3. Apply binary search over the window to find the maximum value that still satisfies the SLA.
!!! important
SLA tuning is applied over each combination of `--serve-params`, `--bench-params`, and `--sla-params`.
For a given combination of `--serve-params` and `--bench-params`, we share the benchmark results across `--sla-params` to avoid rerunning benchmarks with the same SLA variable value.
### Visualizer
[`vllm/benchmarks/sweep/plot.py`](../../vllm/benchmarks/sweep/plot.py) can be used to plot performance curves from parameter sweep results.
Example command:
```bash
vllm bench sweep plot benchmarks/results/<timestamp> \
--var-x max_concurrency \
--row-by random_input_len \
--col-by random_output_len \
--curve-by api_server_count,max_num_batched_tokens \
--filter-by 'max_concurrency<=1024'
```
!!! tip
You can use `--dry-run` to preview the figures to be plotted.
## Performance Benchmarks
The performance benchmarks are used for development to confirm whether new changes improve performance under various workloads. They are triggered on every commit with both the `perf-benchmarks` and `ready` labels, and when a PR is merged into vLLM.
### Manually Trigger the benchmark
Use [vllm-ci-test-repo images](https://gallery.ecr.aws/q9t5s3a7/vllm-ci-test-repo) with vLLM benchmark suite.
For CPU environment, please use the image with "-cpu" postfix.
Here is an example docker run command for a CPU environment.
```bash
docker run -it --entrypoint /bin/bash -v /data/huggingface:/root/.cache/huggingface -e HF_TOKEN='' --shm-size=16g --name vllm-cpu-ci public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:1da94e673c257373280026f75ceb4effac80e892-cpu
```
Then, run the following command inside the docker container.
```bash
bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
```
When run, the benchmark script generates results under the **benchmark/results** folder, along with the benchmark_results.md and benchmark_results.json files.
#### Runtime environment variables
- `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0.
- `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file).
- `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file).
- `THROUGHPUT_JSON`: JSON file to use for the throughput tests. Default value is empty string (use default file).
- `REMOTE_HOST`: IP for the remote vLLM service to benchmark. Default value is empty string.
- `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string.
For more results visualization, check the [visualizing the results](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md#visualizing-the-results).
The latest performance results are hosted on the public [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm).
More information on the performance benchmarks and their parameters can be found in [Benchmark README](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md) and [performance benchmark description](../../.buildkite/performance-benchmarks/performance-benchmarks-descriptions.md).
### Continuous Benchmarking
The continuous benchmarking provides automated performance monitoring for vLLM across different models and GPU devices. This helps track vLLM's performance characteristics over time and identify any performance regressions or improvements.
#### How It Works
The continuous benchmarking is triggered via a [GitHub workflow CI](https://github.com/pytorch/pytorch-integration-testing/actions/workflows/vllm-benchmark.yml) in the PyTorch infrastructure repository, which runs automatically every 4 hours. The workflow executes three types of performance tests:
- **Serving tests**: Measure request handling and API performance
- **Throughput tests**: Evaluate token generation rates
- **Latency tests**: Assess response time characteristics
#### Benchmark Configuration
The benchmarking currently runs on a predefined set of models configured in the [vllm-benchmarks directory](https://github.com/pytorch/pytorch-integration-testing/tree/main/vllm-benchmarks/benchmarks). To add new models for benchmarking:
1. Navigate to the appropriate GPU directory in the benchmarks configuration
2. Add your model specifications to the corresponding configuration files
3. The new models will be included in the next scheduled benchmark run
#### Viewing Results
All continuous benchmarking results are automatically published to the public [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm).

View File

@ -0,0 +1,58 @@
# Performance Dashboard
The performance dashboard is used to confirm whether new changes improve/degrade performance under various workloads.
It is updated by triggering benchmark runs on every commit with both the `perf-benchmarks` and `ready` labels, and when a PR is merged into vLLM.
The results are automatically published to the public [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm).
## Manually Trigger the benchmark
Use [vllm-ci-test-repo images](https://gallery.ecr.aws/q9t5s3a7/vllm-ci-test-repo) with vLLM benchmark suite.
For CPU environment, please use the image with "-cpu" postfix.
Here is an example docker run command for a CPU environment.
```bash
docker run -it --entrypoint /bin/bash -v /data/huggingface:/root/.cache/huggingface -e HF_TOKEN='' --shm-size=16g --name vllm-cpu-ci public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:1da94e673c257373280026f75ceb4effac80e892-cpu
```
Then, run the following command inside the docker container.
```bash
bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
```
When run, the benchmark script generates results under the **benchmark/results** folder, along with the benchmark_results.md and benchmark_results.json files.
### Runtime environment variables
- `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0.
- `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file).
- `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file).
- `THROUGHPUT_JSON`: JSON file to use for the throughput tests. Default value is empty string (use default file).
- `REMOTE_HOST`: IP for the remote vLLM service to benchmark. Default value is empty string.
- `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string.
For more results visualization, check the [visualizing the results](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md#visualizing-the-results).
More information on the performance benchmarks and their parameters can be found in [Benchmark README](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md) and [performance benchmark description](../../.buildkite/performance-benchmarks/performance-benchmarks-descriptions.md).
## Continuous Benchmarking
The continuous benchmarking provides automated performance monitoring for vLLM across different models and GPU devices. This helps track vLLM's performance characteristics over time and identify any performance regressions or improvements.
### How It Works
The continuous benchmarking is triggered via a [GitHub workflow CI](https://github.com/pytorch/pytorch-integration-testing/actions/workflows/vllm-benchmark.yml) in the PyTorch infrastructure repository, which runs automatically every 4 hours. The workflow executes three types of performance tests:
- **Serving tests**: Measure request handling and API performance
- **Throughput tests**: Evaluate token generation rates
- **Latency tests**: Assess response time characteristics
### Benchmark Configuration
The benchmarking currently runs on a predefined set of models configured in the [vllm-benchmarks directory](https://github.com/pytorch/pytorch-integration-testing/tree/main/vllm-benchmarks/benchmarks). To add new models for benchmarking:
1. Navigate to the appropriate GPU directory in the benchmarks configuration
2. Add your model specifications to the corresponding configuration files
3. The new models will be included in the next scheduled benchmark run

178
docs/benchmarking/sweeps.md Normal file
View File

@ -0,0 +1,178 @@
# Parameter Sweeps
## Online Benchmark
### Basic
`vllm bench sweep serve` automatically starts `vllm serve` and runs `vllm bench serve` to evaluate vLLM over multiple configurations.
Follow these steps to run the script:
1. Construct the base command to `vllm serve`, and pass it to the `--serve-cmd` option.
2. Construct the base command to `vllm bench serve`, and pass it to the `--bench-cmd` option.
3. (Optional) If you would like to vary the settings of `vllm serve`, create a new JSON file and populate it with the parameter combinations you want to test. Pass the file path to `--serve-params`.
- Example: Tuning `--max-num-seqs` and `--max-num-batched-tokens`:
```json
[
{
"max_num_seqs": 32,
"max_num_batched_tokens": 1024
},
{
"max_num_seqs": 64,
"max_num_batched_tokens": 1024
},
{
"max_num_seqs": 64,
"max_num_batched_tokens": 2048
},
{
"max_num_seqs": 128,
"max_num_batched_tokens": 2048
},
{
"max_num_seqs": 128,
"max_num_batched_tokens": 4096
},
{
"max_num_seqs": 256,
"max_num_batched_tokens": 4096
}
]
```
4. (Optional) If you would like to vary the settings of `vllm bench serve`, create a new JSON file and populate it with the parameter combinations you want to test. Pass the file path to `--bench-params`.
- Example: Using different input/output lengths for random dataset:
```json
[
{
"random_input_len": 128,
"random_output_len": 32
},
{
"random_input_len": 256,
"random_output_len": 64
},
{
"random_input_len": 512,
"random_output_len": 128
}
]
```
5. Determine where you want to save the results, and pass that to `--output-dir`.
Example command:
```bash
vllm bench sweep serve \
--serve-cmd 'vllm serve meta-llama/Llama-2-7b-chat-hf' \
--bench-cmd 'vllm bench serve --model meta-llama/Llama-2-7b-chat-hf --backend vllm --endpoint /v1/completions --dataset-name sharegpt --dataset-path benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json' \
--serve-params benchmarks/serve_hparams.json \
--bench-params benchmarks/bench_hparams.json \
-o benchmarks/results
```
!!! important
If both `--serve-params` and `--bench-params` are passed, the script will iterate over the Cartesian product between them.
You can use `--dry-run` to preview the commands to be run.
We only start the server once for each `--serve-params`, and keep it running for multiple `--bench-params`.
Between each benchmark run, we call the `/reset_prefix_cache` and `/reset_mm_cache` endpoints to get a clean slate for the next run.
In case you are using a custom `--serve-cmd`, you can override the commands used for resetting the state by setting `--after-bench-cmd`.
!!! note
By default, each parameter combination is run 3 times to make the results more reliable. You can adjust the number of runs by setting `--num-runs`.
!!! tip
You can use the `--resume` option to continue the parameter sweep if one of the runs failed.
### SLA auto-tuner
`vllm bench sweep serve_sla` is a wrapper over `vllm bench sweep serve` that tunes either the request rate or concurrency (choose using `--sla-variable`) in order to satisfy the SLA constraints given by `--sla-params`.
For example, to ensure E2E latency within different target values for 99% of requests:
```json
[
{
"p99_e2el_ms": "<=200"
},
{
"p99_e2el_ms": "<=500"
},
{
"p99_e2el_ms": "<=1000"
},
{
"p99_e2el_ms": "<=2000"
}
]
```
Example command:
```bash
vllm bench sweep serve_sla \
--serve-cmd 'vllm serve meta-llama/Llama-2-7b-chat-hf' \
--bench-cmd 'vllm bench serve --model meta-llama/Llama-2-7b-chat-hf --backend vllm --endpoint /v1/completions --dataset-name sharegpt --dataset-path benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json' \
--serve-params benchmarks/serve_hparams.json \
--bench-params benchmarks/bench_hparams.json \
--sla-params benchmarks/sla_hparams.json \
--sla-variable max_concurrency \
-o benchmarks/results
```
The algorithm for adjusting the SLA variable is as follows:
1. Run the benchmark with infinite QPS, and use the corresponding metrics to determine the initial value of the variable.
- For example, the initial request rate is set to the concurrency under infinite QPS.
2. If the SLA is still satisfied, keep doubling the value until the SLA is no longer satisfied. This gives a relatively narrow window that contains the point where the SLA is barely satisfied.
3. Apply binary search over the window to find the maximum value that still satisfies the SLA.
!!! important
SLA tuning is applied over each combination of `--serve-params`, `--bench-params`, and `--sla-params`.
For a given combination of `--serve-params` and `--bench-params`, we share the benchmark results across `--sla-params` to avoid rerunning benchmarks with the same SLA variable value.
## Visualization
### Basic
`vllm bench sweep plot` can be used to plot performance curves from parameter sweep results.
Example command:
```bash
vllm bench sweep plot benchmarks/results/<timestamp> \
--var-x max_concurrency \
--row-by random_input_len \
--col-by random_output_len \
--curve-by api_server_count,max_num_batched_tokens \
--filter-by 'max_concurrency<=1024'
```
!!! tip
You can use `--dry-run` to preview the figures to be plotted.
### Pareto chart
`vllm bench sweep plot_pareto` helps pick configurations that balance per-user and per-GPU throughput.
Higher concurrency or batch size can raise GPU efficiency (per-GPU) but can add per-user latency, while lower concurrency improves the per-user rate but underutilizes GPUs. The Pareto frontier shows the best achievable pairs across your runs.
- x-axis: tokens/s/user = `output_throughput` ÷ concurrency (`--user-count-var`, default `max_concurrency`, fallback `max_concurrent_requests`).
- y-axis: tokens/s/GPU = `output_throughput` ÷ GPU count (`--gpu-count-var` if set; otherwise gpu_count is TP×PP×DP).
- Output: a single figure at `OUTPUT_DIR/pareto/PARETO.png`.
- Show the configuration used in each data point with `--label-by` (default: `max_concurrency,gpu_count`).
Example:
```bash
vllm bench sweep plot_pareto benchmarks/results/<timestamp> \
--label-by max_concurrency,tensor_parallel_size,pipeline_parallel_size
```

View File

@ -0,0 +1,9 @@
# vllm bench sweep plot_pareto
## JSON CLI Arguments
--8<-- "docs/cli/json_tip.inc.md"
## Arguments
--8<-- "docs/argparse/bench_sweep_plot_pareto.inc.md"

View File

@ -10,6 +10,7 @@ Stay tuned for upcoming meetups! Follow us on [Twitter/X](https://x.com/vllm_pro
Below you'll find slides and recordings from our previous meetups: Below you'll find slides and recordings from our previous meetups:
- [vLLM Bangkok Meetup](https://luma.com/v0f647nv), November 21st 2025. [[Slides]](https://drive.google.com/drive/folders/1H0DS57F8HQ5q3kSOSoRmucPJWL3E0A_X?usp=sharing)
- [vLLM Zurich Meetup](https://luma.com/0gls27kb), November 6th 2025. [[Slides]](https://docs.google.com/presentation/d/1UC9PTLCHYXQpOmJDSFg6Sljra3iVXzc09DeEI7dnxMc/edit?usp=sharing) [[Recording]](https://www.youtube.com/watch?v=6m6ZE6yVEDI) - [vLLM Zurich Meetup](https://luma.com/0gls27kb), November 6th 2025. [[Slides]](https://docs.google.com/presentation/d/1UC9PTLCHYXQpOmJDSFg6Sljra3iVXzc09DeEI7dnxMc/edit?usp=sharing) [[Recording]](https://www.youtube.com/watch?v=6m6ZE6yVEDI)
- [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/xSrYXjNgr1HbCP4ExYNG1w), November 1st 2025. [[Slides]](https://drive.google.com/drive/folders/1nQJ8ZkLSjKxvu36sSHaceVXtttbLvvu-?usp=drive_link) - [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/xSrYXjNgr1HbCP4ExYNG1w), November 1st 2025. [[Slides]](https://drive.google.com/drive/folders/1nQJ8ZkLSjKxvu36sSHaceVXtttbLvvu-?usp=drive_link)
- [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/__xb4OyOsImz-9eAVrdlcg), October 25th 2025. [[Slides]](https://drive.google.com/drive/folders/1KqwjsFJLfEsC8wlDugnrR61zsWHt94Q6) - [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/__xb4OyOsImz-9eAVrdlcg), October 25th 2025. [[Slides]](https://drive.google.com/drive/folders/1KqwjsFJLfEsC8wlDugnrR61zsWHt94Q6)

View File

@ -29,7 +29,7 @@ The initialization code should look like this:
```python ```python
from torch import nn from torch import nn
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.attention import Attention from vllm.attention.layer import Attention
class MyAttention(nn.Module): class MyAttention(nn.Module):
def __init__(self, vllm_config: VllmConfig, prefix: str): def __init__(self, vllm_config: VllmConfig, prefix: str):
@ -113,8 +113,6 @@ See [this page](registration.md) for instructions on how to register your new mo
### How to support models with interleaving sliding windows? ### How to support models with interleaving sliding windows?
For models with interleaving sliding windows (e.g. `google/gemma-2-2b-it` and `mistralai/Ministral-8B-Instruct-2410`), the scheduler will treat the model as a full-attention model, i.e., kv-cache of all tokens will not be dropped. This is to make sure prefix caching works with these models. Sliding window only appears as a parameter to the attention kernel computation.
To support a model with interleaving sliding windows, we need to take care of the following details: To support a model with interleaving sliding windows, we need to take care of the following details:
- Make sure the model's `config.json` contains `layer_types`. - Make sure the model's `config.json` contains `layer_types`.

View File

@ -11,6 +11,8 @@ We support tracing vLLM workers using the `torch.profiler` module. You can enabl
- `VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY=1` to record memory, off by default - `VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY=1` to record memory, off by default
- `VLLM_TORCH_PROFILER_WITH_STACK=1` to enable recording stack information, on by default - `VLLM_TORCH_PROFILER_WITH_STACK=1` to enable recording stack information, on by default
- `VLLM_TORCH_PROFILER_WITH_FLOPS=1` to enable recording FLOPs, off by default - `VLLM_TORCH_PROFILER_WITH_FLOPS=1` to enable recording FLOPs, off by default
- `VLLM_TORCH_PROFILER_USE_GZIP=0` to disable gzip-compressing profiling files, on by default
- `VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL=0` to disable dumping and printing the aggregated CUDA self time table, on by default
The OpenAI server also needs to be started with the `VLLM_TORCH_PROFILER_DIR` environment variable set. The OpenAI server also needs to be started with the `VLLM_TORCH_PROFILER_DIR` environment variable set.

View File

@ -84,12 +84,14 @@ See the following figures for a quick comparison between the previous and curren
```python ```python
class BatchDescriptor(NamedTuple): class BatchDescriptor(NamedTuple):
num_tokens: int num_tokens: int
uniform_decode: bool = False num_reqs: int
uniform: bool = False
has_lora: bool = False
``` ```
where `num_tokens` can be the padded token length, and `uniform_decode` is determined by if `max_query_len` of a batch is equal to the desired `max_query_len` of a uniform_decode, and the num_scheduled_tokens is divisible by that desired `max_query_len`. where `num_tokens` can be the padded token length, and `uniform` indicates if all the requests have the same query lengths. Many attention backends only support full cudagraphs when the batches are uniform; pure decode batches are uniform but may not be query length 1 (i.e. `num_tokens == num_reqs`), this occurs in the validation pass of spec-decode where "decode" batches will have a query length of `1+num_spec_tokens`.
The goal of this structure is to uniquely identify a (padded) batch with minimal possible items corresponding to a CUDA Graphs item. We are safe to exclude items like `uniform_query_len` because it is a constant at runtime for a certain setup currently. For example, it should be either `1` for a commonly pure decode or `1+num_spec_tokens` for a validation phase of speculative decode. The goal of this structure is to uniquely identify a (padded) batch with minimal possible items corresponding to a CUDA Graphs item.
!!! note !!! note
The prototype of `BatchDescriptor` may be extended for more general situations in the future, e.g., include more items, like `uniform_query_len` to support multiple different uniform decode lengths settings (<https://github.com/vllm-project/vllm/pull/23679>), or other modifications needed to support CUDA Graphs for models whose inputs are not necessarily token length aware (for example, some multi-modal inputs). The prototype of `BatchDescriptor` may be extended for more general situations in the future, e.g., include more items, like `uniform_query_len` to support multiple different uniform decode lengths settings (<https://github.com/vllm-project/vllm/pull/23679>), or other modifications needed to support CUDA Graphs for models whose inputs are not necessarily token length aware (for example, some multi-modal inputs).

View File

@ -8,9 +8,9 @@ TL;DR:
| Online Flag | Offline Flag | Result | | Online Flag | Offline Flag | Result |
|----------|----------|-------------| |----------|----------|-------------|
| --enforce-eager | enforce_eager=True | Turn off torch.compile and CUDAGraphs | | --enforce-eager | enforce_eager=True | Turn off torch.compile and CUDAGraphs |
| -O.mode=0 | mode=CompilationMode.NONE | Turn off torch.compile only | | -cc.mode=0 | mode=CompilationMode.NONE | Turn off torch.compile only |
| -O.cudagraph_mode=NONE | compilation_config=CompilationConfig(cudagraph_mode=CUDAGraphMode.NONE) | Turn off CUDAGraphs only | | -cc.cudagraph_mode=NONE | compilation_config=CompilationConfig(cudagraph_mode=CUDAGraphMode.NONE) | Turn off CUDAGraphs only |
| -O.backend=eager | compilation_config=CompilationConfig(backend='eager') | Turn off TorchInductor | | -cc.backend=eager | compilation_config=CompilationConfig(backend='eager') | Turn off TorchInductor |
## vLLM-torch.compile overview ## vLLM-torch.compile overview
@ -86,11 +86,11 @@ LLM(model, enforce_eager=True)
``` ```
To turn off just torch.compile, pass `mode = NONE` to the compilation config. To turn off just torch.compile, pass `mode = NONE` to the compilation config.
(`-O` is short for `--compilation_config`): (`-cc` is short for `--compilation_config`; `-O.*` dotted syntax is deprecated):
```sh ```sh
# Online # Online
vllm serve -O.mode=0 vllm serve -cc.mode=0
``` ```
```py ```py
@ -103,7 +103,7 @@ To turn off just CUDAGraphs, pass `cudagraph_mode = NONE`:
```sh ```sh
# Online # Online
vllm serve -O.cudagraph_mode=NONE vllm serve -cc.cudagraph_mode=NONE
``` ```
```py ```py
@ -183,10 +183,10 @@ help debug the issue:
```sh ```sh
# Online - using unbacked mode # Online - using unbacked mode
vllm serve meta-llama/Llama-3.2-1B -O.dynamic_shapes_config.type=unbacked vllm serve meta-llama/Llama-3.2-1B -cc.dynamic_shapes_config.type=unbacked
# Online - using backed_size_oblivious mode # Online - using backed_size_oblivious mode
vllm serve meta-llama/Llama-3.2-1B -O.dynamic_shapes_config.type=backed_size_oblivious vllm serve meta-llama/Llama-3.2-1B -cc.dynamic_shapes_config.type=backed_size_oblivious
``` ```
```py ```py
@ -233,7 +233,7 @@ to the compilation config:
```sh ```sh
# online # online
vllm serve -O.backend=eager vllm serve -cc.backend=eager
``` ```
```py ```py
@ -252,7 +252,7 @@ You can also use `TORCH_LOGS=output_code <command>` to print the Inductor output
### Editable TorchInductor code ### Editable TorchInductor code
You can edit the TorchInductor code that gets run by setting `VLLM_COMPILE_CACHE_SAVE_FORMAT=unpacked` You can edit the TorchInductor code that gets run by setting `VLLM_COMPILE_CACHE_SAVE_FORMAT=unpacked`
or passing `-O.compile_cache_save_format=unpacked`. The default is `binary`, which means it is not editable. or passing `-cc.compile_cache_save_format=unpacked`. The default is `binary`, which means it is not editable.
This is a useful technique: you can put breakpoints (e.g. `torch.distributed.breakpoint()`) This is a useful technique: you can put breakpoints (e.g. `torch.distributed.breakpoint()`)
and print statements in the output code. and print statements in the output code.
@ -299,7 +299,7 @@ To turn off just CUDAGraphs, pass `cudagraph_mode = NONE`:
```sh ```sh
# Online # Online
vllm serve -O.cudagraph_mode=NONE vllm serve -cc.cudagraph_mode=NONE
``` ```
```py ```py

View File

@ -21,7 +21,7 @@ Let's say we want to serve the popular Qwen model by running `vllm serve Qwen/Qw
Beyond that, there are two more things vLLM depends on Hugging Face for. Beyond that, there are two more things vLLM depends on Hugging Face for.
1. **Tokenizer**: vLLM uses the tokenizer from Hugging Face to tokenize the input text. The tokenizer is loaded using [AutoTokenizer.from_pretrained](https://huggingface.co/docs/transformers/en/model_doc/auto#transformers.AutoTokenizer.from_pretrained) with the `model` argument as the model name and the `--revision` argument as the revision. It is also possible to use a tokenizer from another model by specifying the `--tokenizer` argument in the `vllm serve` command. Other relevant arguments are `--tokenizer-revision` and `--tokenizer-mode`. Please check Hugging Face's documentation for the meaning of these arguments. This part of the logic can be found in the [get_tokenizer](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/transformers_utils/tokenizer.py#L87) function. After obtaining the tokenizer, notably, vLLM will cache some expensive attributes of the tokenizer in [get_cached_tokenizer](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/transformers_utils/tokenizer.py#L24). 1. **Tokenizer**: vLLM uses the tokenizer from Hugging Face to tokenize the input text. The tokenizer is loaded using [AutoTokenizer.from_pretrained](https://huggingface.co/docs/transformers/en/model_doc/auto#transformers.AutoTokenizer.from_pretrained) with the `model` argument as the model name and the `--revision` argument as the revision. It is also possible to use a tokenizer from another model by specifying the `--tokenizer` argument in the `vllm serve` command. Other relevant arguments are `--tokenizer-revision` and `--tokenizer-mode`. Please check Hugging Face's documentation for the meaning of these arguments. This part of the logic can be found in the [get_tokenizer](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/transformers_utils/tokenizer.py#L87) function. 
After obtaining the tokenizer, notably, vLLM will cache some expensive attributes of the tokenizer in [vllm.tokenizers.hf.get_cached_tokenizer][].
2. **Model weight**: vLLM downloads the model weight from the Hugging Face model hub using the `model` argument as the model name and the `--revision` argument as the revision. vLLM provides the argument `--load-format` to control what files to download from the model hub. By default, it will try to load the weights in the safetensors format and fall back to the PyTorch bin format if the safetensors format is not available. We can also pass `--load-format dummy` to skip downloading the weights. 2. **Model weight**: vLLM downloads the model weight from the Hugging Face model hub using the `model` argument as the model name and the `--revision` argument as the revision. vLLM provides the argument `--load-format` to control what files to download from the model hub. By default, it will try to load the weights in the safetensors format and fall back to the PyTorch bin format if the safetensors format is not available. We can also pass `--load-format dummy` to skip downloading the weights.
- It is recommended to use the safetensors format, as it is efficient for loading in distributed inference and also safe from arbitrary code execution. See the [documentation](https://huggingface.co/docs/safetensors/en/index) for more information on the safetensors format. This part of the logic can be found [here](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/model_executor/model_loader/loader.py#L385). Please note that: - It is recommended to use the safetensors format, as it is efficient for loading in distributed inference and also safe from arbitrary code execution. See the [documentation](https://huggingface.co/docs/safetensors/en/index) for more information on the safetensors format. This part of the logic can be found [here](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/model_executor/model_loader/loader.py#L385). Please note that:

View File

@ -77,9 +77,9 @@ The `parse_request` method is used for validating the user prompt and converting
The `pre_process*` methods take the validated plugin input to generate vLLM's model prompts for regular inference. The `pre_process*` methods take the validated plugin input to generate vLLM's model prompts for regular inference.
The `post_process*` methods take `PoolingRequestOutput` objects as input and generate a custom plugin output. The `post_process*` methods take `PoolingRequestOutput` objects as input and generate a custom plugin output.
The `validate_or_generate_params` method is used for validating with the plugin any `SamplingParameters`/`PoolingParameters` received with the user request, or to generate new ones if none are specified. The function always returns the validated/generated parameters. The `validate_or_generate_params` method is used for validating with the plugin any `SamplingParameters`/`PoolingParameters` received with the user request, or to generate new ones if none are specified. The function always returns the validated/generated parameters.
The `output_to_response` method is used only for online serving and converts the plugin output to the `IOProcessorResponse` type that is then returned by the API Server. The implementation of the `/pooling` serving endpoint is available here [vllm/entrypoints/openai/serving_pooling.py](../../vllm/entrypoints/openai/serving_pooling.py). The `output_to_response` method is used only for online serving and converts the plugin output to the `IOProcessorResponse` type that is then returned by the API Server. The implementation of the `/pooling` serving endpoint is available here [vllm/entrypoints/openai/serving_pooling.py](../../vllm/entrypoints/pooling/pooling/serving.py).
An example implementation of a plugin that enables generating geotiff images with the PrithviGeospatialMAE model is available [here](https://github.com/IBM/terratorch/tree/main/terratorch/vllm/plugins/segmentation). Please, also refer to our online ([examples/online_serving/pooling/prithvi_geospatial_mae.py](../../examples/online_serving/pooling/prithvi_geospatial_mae.py)) and offline ([examples/offline_inference/pooling/prithvi_geospatial_mae_io_processor.py](../../examples/offline_inference/pooling/prithvi_geospatial_mae_io_processor.py)) inference examples. An example implementation of a plugin that enables generating geotiff images with the PrithviGeospatialMAE model is available [here](https://github.com/IBM/terratorch/tree/main/terratorch/vllm/plugins/segmentation). Please, also refer to our online ([examples/pooling/plugin/prithvi_geospatial_mae_client.py](../../examples/pooling/plugin/prithvi_geospatial_mae_client.py)) and offline ([examples/pooling/plugin/prithvi_geospatial_mae_io_processor.py](../../examples/pooling/plugin/prithvi_geospatial_mae_io_processor.py)) inference examples.
## Using an IO Processor plugin ## Using an IO Processor plugin

View File

@ -263,6 +263,29 @@ record:
- End-to-end latency - the interval between frontend `arrival_time` - End-to-end latency - the interval between frontend `arrival_time`
and the frontend receiving the final token. and the frontend receiving the final token.
### KV Cache Residency Metrics
We also emit a set of histograms that describe how long sampled KV cache
blocks stay resident and how often they are reused. Sampling
(`--kv-cache-metrics-sample`) keeps the overhead tiny; when a block is
chosen we record:
- `lifetime` — allocation ⟶ eviction
- `idle before eviction` — last touch ⟶ eviction
- `reuse gaps` — the pauses between touches when the block gets reused
Those map directly to the Prometheus metrics:
- `vllm:kv_block_lifetime_seconds` — how long each sampled block exists.
- `vllm:kv_block_idle_before_evict_seconds` — idle tail after the final access.
- `vllm:kv_block_reuse_gap_seconds` — time between consecutive touches.
The engine core only ships raw eviction events via `SchedulerStats`; the
frontend drains them, turns them into Prometheus observations, and also
exposes the same data through `LLM.get_metrics()` when logging is on.
Looking at lifetime and idle time on one chart makes it easy to spot
stranded cache or workloads that pin prompts for a long decode.
### Metrics Publishing - Logging ### Metrics Publishing - Logging
The `LoggingStatLogger` metrics publisher outputs a log `INFO` message The `LoggingStatLogger` metrics publisher outputs a log `INFO` message

View File

@ -60,7 +60,7 @@ Modular kernels are supported by the following `FusedMoEMethodBase` classes.
- [`ModelOptFp8MoEMethod`][vllm.model_executor.layers.quantization.modelopt.ModelOptFp8MoEMethod] - [`ModelOptFp8MoEMethod`][vllm.model_executor.layers.quantization.modelopt.ModelOptFp8MoEMethod]
- [`Fp8MoEMethod`][vllm.model_executor.layers.quantization.fp8.Fp8MoEMethod] - [`Fp8MoEMethod`][vllm.model_executor.layers.quantization.fp8.Fp8MoEMethod]
- [`CompressedTensorsW4A4MoeMethod`][vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe.CompressedTensorsW4A4MoeMethod] - [`CompressedTensorsW4A4Nvfp4MoEMethod`][vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe.CompressedTensorsW4A4Nvfp4MoEMethod]
- [`CompressedTensorsW8A8Fp8MoEMethod`][vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe.CompressedTensorsW8A8Fp8MoEMethod] - [`CompressedTensorsW8A8Fp8MoEMethod`][vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe.CompressedTensorsW8A8Fp8MoEMethod]
- [`Mxfp4MoEMethod`][vllm.model_executor.layers.quantization.mxfp4.Mxfp4MoEMethod] - [`Mxfp4MoEMethod`][vllm.model_executor.layers.quantization.mxfp4.Mxfp4MoEMethod]
- [`UnquantizedFusedMoEMethod`][vllm.model_executor.layers.fused_moe.layer.UnquantizedFusedMoEMethod] - [`UnquantizedFusedMoEMethod`][vllm.model_executor.layers.fused_moe.layer.UnquantizedFusedMoEMethod]

View File

@ -0,0 +1,69 @@
<!-- markdownlint-disable -->
# Optimization Levels
## Overview
vLLM now supports optimization levels (`-O0`, `-O1`, `-O2`, `-O3`). Optimization levels provide an intuitive mechanism for users to trade startup time for performance. Higher levels have better performance but worse startup time. These optimization levels have associated defaults to help users get desired out of the box performance. Importantly, defaults set by optimization levels are purely defaults; explicit user settings will not be overwritten.
## Level Summaries and Usage Examples
```bash
# CLI usage
python -m vllm.entrypoints.api_server --model RedHatAI/Llama-3.2-1B-FP8 -O0
# Python API usage
from vllm.entrypoints.llm import LLM
llm = LLM(
model="RedHatAI/Llama-3.2-1B-FP8",
optimization_level=0
)
```
#### `-O1`: Quick Optimizations
- **Startup**: Moderate startup time
- **Performance**: Inductor compilation, CUDAGraphMode.PIECEWISE
- **Use case**: Balance for most development scenarios
```bash
# CLI usage
python -m vllm.entrypoints.api_server --model RedHatAI/Llama-3.2-1B-FP8 -O1
# Python API usage
from vllm.entrypoints.llm import LLM
llm = LLM(
model="RedHatAI/Llama-3.2-1B-FP8",
optimization_level=1
)
```
#### `-O2`: Full Optimizations (Default)
- **Startup**: Longer startup time
- **Performance**: `-O1` + CUDAGraphMode.FULL_AND_PIECEWISE
- **Use case**: Production workloads where performance is important. This is the default use case. It is also very similar to the previous default. The primary difference is that noop & fusion flags are enabled.
```bash
# CLI usage (default, so optional)
python -m vllm.entrypoints.api_server --model RedHatAI/Llama-3.2-1B-FP8 -O2
# Python API usage
from vllm.entrypoints.llm import LLM
llm = LLM(
model="RedHatAI/Llama-3.2-1B-FP8",
optimization_level=2 # This is the default
)
```
#### `-O3`: Full Optimization
Still in development. Infrastructure has been added to prevent changing the API in a future
release. Currently behaves the same as `-O2`.
## Troubleshooting
### Common Issues
1. **Startup Time Too Long**: Use `-O0` or `-O1` for faster startup
2. **Compilation Errors**: Use `debug_dump_path` for additional debugging information
3. **Performance Issues**: Ensure using `-O2` for production

View File

@ -86,7 +86,7 @@ Every plugin has three parts:
}, },
... ...
) )
``` ```
Please make sure `vllm_add_dummy_platform:register` is a callable function and returns the platform class's fully qualified name. for example: Please make sure `vllm_add_dummy_platform:register` is a callable function and returns the platform class's fully qualified name. for example:

View File

@ -117,7 +117,7 @@ vllm serve meta-llama/Llama-3.2-1B \
# Alternative: Using dot notation (simpler for single values) # Alternative: Using dot notation (simpler for single values)
vllm serve meta-llama/Llama-3.2-1B -O.dynamic_shapes_config.type=unbacked vllm serve meta-llama/Llama-3.2-1B -cc.dynamic_shapes_config.type=unbacked
``` ```
#### Choosing the Right Mode #### Choosing the Right Mode

View File

@ -216,14 +216,13 @@ You can add a new `ReasoningParser` similar to [vllm/reasoning/deepseek_r1_reaso
# import the required packages # import the required packages
from vllm.reasoning import ReasoningParser, ReasoningParserManager from vllm.reasoning import ReasoningParser, ReasoningParserManager
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
DeltaMessage)
# define a reasoning parser and register it to vllm # define a reasoning parser and register it to vllm
# the name list in register_module can be used # the name list in register_module can be used
# in --reasoning-parser. # in --reasoning-parser.
class ExampleParser(ReasoningParser): class ExampleParser(ReasoningParser):
def __init__(self, tokenizer: AnyTokenizer): def __init__(self, tokenizer: TokenizerLike):
super().__init__(tokenizer) super().__init__(tokenizer)
def extract_reasoning_streaming( def extract_reasoning_streaming(

View File

@ -371,7 +371,8 @@ Olmo 3 models output tool calls in a format that is very similar to the one expe
Supported models: Supported models:
* TODO (will be updated after Olmo 3 release) * `allenai/Olmo-3-7B-Instruct`
* `allenai/Olmo-3-32B-Think`
Flags: `--tool-call-parser olmo3` Flags: `--tool-call-parser olmo3`
@ -421,7 +422,7 @@ Here is a summary of a plugin file:
# in --tool-call-parser. you can define as many # in --tool-call-parser. you can define as many
# tool parsers as you want here. # tool parsers as you want here.
class ExampleToolParser(ToolParser): class ExampleToolParser(ToolParser):
def __init__(self, tokenizer: AnyTokenizer): def __init__(self, tokenizer: TokenizerLike):
super().__init__(tokenizer) super().__init__(tokenizer)
# adjust request. e.g.: set skip special tokens # adjust request. e.g.: set skip special tokens

View File

@ -46,10 +46,23 @@ vLLM is a Python library that supports the following CPU variants. Select your C
### Pre-built wheels ### Pre-built wheels
Currently, there are no pre-built CPU wheels. Please refer to the instructions for [pre-built wheels on GPU](./gpu.md#pre-built-wheels).
When specifying the index URL, please make sure to use the `cpu` variant subdirectory.
For example, the nightly build index is: `https://wheels.vllm.ai/nightly/cpu/`.
### Build wheel from source ### Build wheel from source
#### Set up using Python-only build (without compilation) {#python-only-build}
Please refer to the instructions for [Python-only build on GPU](./gpu.md#python-only-build), and replace the build commands with:
```bash
VLLM_USE_PRECOMPILED=1 VLLM_PRECOMPILED_WHEEL_VARIANT=cpu VLLM_TARGET_DEVICE=cpu uv pip install --editable .
```
#### Full build (with compilation) {#full-build}
=== "Intel/AMD x86" === "Intel/AMD x86"
--8<-- "docs/getting_started/installation/cpu.x86.inc.md:build-wheel-from-source" --8<-- "docs/getting_started/installation/cpu.x86.inc.md:build-wheel-from-source"
@ -125,6 +138,35 @@ vllm serve facebook/opt-125m --dtype=bfloat16
Note, it is recommended to manually reserve 1 CPU for vLLM front-end process when `world_size == 1`. Note, it is recommended to manually reserve 1 CPU for vLLM front-end process when `world_size == 1`.
### What are supported models on CPU?
For the full and up-to-date list of models validated on CPU platforms, please see the official documentation: [Supported Models on CPU](https://docs.vllm.ai/en/latest/models/hardware_supported_models/cpu)
### How to find benchmark configuration examples for supported CPU models?
For any model listed under [Supported Models on CPU](https://docs.vllm.ai/en/latest/models/hardware_supported_models/cpu), optimized runtime configurations are provided in the vLLM Benchmark Suites CPU test cases, defined in [cpu test cases](https://github.com/vllm-project/vllm/blob/main/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json).
For details on how these optimized configurations are determined, see: [performance-benchmark-details](https://github.com/vllm-project/vllm/tree/main/.buildkite/performance-benchmarks#performance-benchmark-details).
To benchmark the supported models using these optimized settings, follow the steps in [running vLLM Benchmark Suite manually](https://docs.vllm.ai/en/latest/contributing/benchmarks/#manually-trigger-the-benchmark) and run the Benchmark Suite on a CPU environment.
Below is an example command to benchmark all CPU-supported models using optimized configurations.
```bash
ON_CPU=1 bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
```
The benchmark results will be saved in `./benchmark/results/`.
In the directory, the generated `.commands` files contain all example commands for the benchmark.
We recommend configuring tensor-parallel-size to match the number of NUMA nodes on your system. Note that the current release does not support tensor-parallel-size=6.
To determine the number of NUMA nodes available, use the following command:
```bash
lscpu | grep "NUMA node(s):" | awk '{print $3}'
```
For performance reference, users may also consult the [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm&deviceName=cpu)
, which publishes default-model CPU results produced using the same Benchmark Suite.
### How to decide `VLLM_CPU_OMP_THREADS_BIND`? ### How to decide `VLLM_CPU_OMP_THREADS_BIND`?
- Default `auto` thread-binding is recommended for most cases. Ideally, each OpenMP thread will be bound to a dedicated physical core respectively, threads of each rank will be bound to the same NUMA node respectively, and 1 CPU per rank will be reserved for other vLLM components when `world_size > 1`. If you have any performance problems or unexpected binding behaviours, please try to bind threads as following. - Default `auto` thread-binding is recommended for most cases. Ideally, each OpenMP thread will be bound to a dedicated physical core respectively, threads of each rank will be bound to the same NUMA node respectively, and 1 CPU per rank will be reserved for other vLLM components when `world_size > 1`. If you have any performance problems or unexpected binding behaviours, please try to bind threads as following.

View File

@ -26,42 +26,49 @@ uv pip install vllm --torch-backend=auto
??? console "pip" ??? console "pip"
```bash ```bash
# Install vLLM with CUDA 12.8. # Install vLLM with CUDA 12.9.
pip install vllm --extra-index-url https://download.pytorch.org/whl/cu128 pip install vllm --extra-index-url https://download.pytorch.org/whl/cu129
``` ```
We recommend leveraging `uv` to [automatically select the appropriate PyTorch index at runtime](https://docs.astral.sh/uv/guides/integration/pytorch/#automatic-backend-selection) by inspecting the installed CUDA driver version via `--torch-backend=auto` (or `UV_TORCH_BACKEND=auto`). To select a specific backend (e.g., `cu126`), set `--torch-backend=cu126` (or `UV_TORCH_BACKEND=cu126`). If this doesn't work, try running `uv self update` to update `uv` first. We recommend leveraging `uv` to [automatically select the appropriate PyTorch index at runtime](https://docs.astral.sh/uv/guides/integration/pytorch/#automatic-backend-selection) by inspecting the installed CUDA driver version via `--torch-backend=auto` (or `UV_TORCH_BACKEND=auto`). To select a specific backend (e.g., `cu128`), set `--torch-backend=cu128` (or `UV_TORCH_BACKEND=cu128`). If this doesn't work, try running `uv self update` to update `uv` first.
!!! note !!! note
NVIDIA Blackwell GPUs (B200, GB200) require a minimum of CUDA 12.8, so make sure you are installing PyTorch wheels with at least that version. PyTorch itself offers a [dedicated interface](https://pytorch.org/get-started/locally/) to determine the appropriate pip command to run for a given target configuration. NVIDIA Blackwell GPUs (B200, GB200) require a minimum of CUDA 12.8, so make sure you are installing PyTorch wheels with at least that version. PyTorch itself offers a [dedicated interface](https://pytorch.org/get-started/locally/) to determine the appropriate pip command to run for a given target configuration.
As of now, vLLM's binaries are compiled with CUDA 12.8 and public PyTorch release versions by default. We also provide vLLM binaries compiled with CUDA 12.6, 11.8, and public PyTorch release versions: As of now, vLLM's binaries are compiled with CUDA 12.9 and public PyTorch release versions by default. We also provide vLLM binaries compiled with CUDA 12.8, 13.0, and public PyTorch release versions:
```bash ```bash
# Install vLLM with a specific CUDA version (e.g., 11.8 or 12.6). # Install vLLM with a specific CUDA version (e.g., 13.0).
export VLLM_VERSION=$(curl -s https://api.github.com/repos/vllm-project/vllm/releases/latest | jq -r .tag_name | sed 's/^v//') export VLLM_VERSION=$(curl -s https://api.github.com/repos/vllm-project/vllm/releases/latest | jq -r .tag_name | sed 's/^v//')
export CUDA_VERSION=118 # or 126 export CUDA_VERSION=130 # or other
uv pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu${CUDA_VERSION}-cp38-abi3-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu${CUDA_VERSION} uv pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu${CUDA_VERSION}-cp38-abi3-manylinux_2_31_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu${CUDA_VERSION}
``` ```
#### Install the latest code #### Install the latest code
LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on an x86 platform with CUDA 12 for every commit since `v0.5.3`. LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for every commit since `v0.5.3` on <https://wheels.vllm.ai/nightly>. There are multiple indices that could be used:
* `https://wheels.vllm.ai/nightly`: the default variant (CUDA with version specified in `VLLM_MAIN_CUDA_VERSION`) built with the last commit on the `main` branch. Currently it is CUDA 12.9.
* `https://wheels.vllm.ai/nightly/<variant>`: all other variants. Now this includes `cu130`, and `cpu`. The default variant (`cu129`) also has a subdirectory to keep consistency.
To install from nightly index, run:
```bash ```bash
uv pip install -U vllm \ uv pip install -U vllm \
--torch-backend=auto \ --torch-backend=auto \
--extra-index-url https://wheels.vllm.ai/nightly --extra-index-url https://wheels.vllm.ai/nightly # add variant subdirectory here if needed
``` ```
??? console "pip" !!! warning "`pip` caveat"
```bash
pip install -U vllm \
--pre \
--extra-index-url https://wheels.vllm.ai/nightly
```
`--pre` is required for `pip` to consider pre-released versions. Using `pip` to install from nightly indices is _not supported_, because `pip` combines packages from `--extra-index-url` and the default index, choosing only the latest version, which makes it difficult to install a development version prior to the released version. In contrast, `uv` gives the extra index [higher priority than the default index](https://docs.astral.sh/uv/pip/compatibility/#packages-that-exist-on-multiple-indexes).
If you insist on using `pip`, you have to specify the full URL of the wheel file (which can be obtained from the web page).
```bash
pip install -U https://wheels.vllm.ai/nightly/vllm-0.11.2.dev399%2Bg3c7461c18-cp38-abi3-manylinux_2_31_x86_64.whl # current nightly build (the filename will change!)
pip install -U https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-0.11.2.dev399%2Bg3c7461c18-cp38-abi3-manylinux_2_31_x86_64.whl # from specific commit
```
##### Install specific revisions ##### Install specific revisions
@ -71,33 +78,13 @@ If you want to access the wheels for previous commits (e.g. to bisect the behavi
export VLLM_COMMIT=72d9c316d3f6ede485146fe5aabd4e61dbc59069 # use full commit hash from the main branch export VLLM_COMMIT=72d9c316d3f6ede485146fe5aabd4e61dbc59069 # use full commit hash from the main branch
uv pip install vllm \ uv pip install vllm \
--torch-backend=auto \ --torch-backend=auto \
--extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT} --extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT} # add variant subdirectory here if needed
``` ```
The `uv` approach works for vLLM `v0.6.6` and later and offers an easy-to-remember command. A unique feature of `uv` is that packages in `--extra-index-url` have [higher priority than the default index](https://docs.astral.sh/uv/pip/compatibility/#packages-that-exist-on-multiple-indexes). If the latest public release is `v0.6.6.post1`, `uv`'s behavior allows installing a commit before `v0.6.6.post1` by specifying the `--extra-index-url`. In contrast, `pip` combines packages from `--extra-index-url` and the default index, choosing only the latest version, which makes it difficult to install a development version prior to the released version.
??? note "pip"
If you want to access the wheels for previous commits (e.g. to bisect the behavior change,
performance regression), due to the limitation of `pip`, you have to specify the full URL of the
wheel file by embedding the commit hash in the URL:
```bash
export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch
pip install https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
```
Note that the wheels are built with Python 3.8 ABI (see [PEP
425](https://peps.python.org/pep-0425/) for more details about ABI), so **they are compatible
with Python 3.8 and later**. The version string in the wheel file name (`1.0.0.dev`) is just a
placeholder to have a unified URL for the wheels, the actual versions of wheels are contained in
the wheel metadata (the wheels listed in the extra index url have correct versions). Although we
don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the
wheels are still built with Python 3.8 ABI to keep the same wheel name as before.
# --8<-- [end:pre-built-wheels] # --8<-- [end:pre-built-wheels]
# --8<-- [start:build-wheel-from-source] # --8<-- [start:build-wheel-from-source]
#### Set up using Python-only build (without compilation) #### Set up using Python-only build (without compilation) {#python-only-build}
If you only need to change Python code, you can build and install vLLM without compilation. Using `uv pip`'s [`--editable` flag](https://docs.astral.sh/uv/pip/packages/#editable-packages), changes you make to the code will be reflected when you run vLLM: If you only need to change Python code, you can build and install vLLM without compilation. Using `uv pip`'s [`--editable` flag](https://docs.astral.sh/uv/pip/packages/#editable-packages), changes you make to the code will be reflected when you run vLLM:
@ -121,18 +108,24 @@ This command will do the following:
In case you see an error about wheel not found when running the above command, it might be because the commit you based on in the main branch was just merged and the wheel is being built. In this case, you can wait for around an hour to try again, or manually assign the previous commit in the installation using the `VLLM_PRECOMPILED_WHEEL_LOCATION` environment variable. In case you see an error about wheel not found when running the above command, it might be because the commit you based on in the main branch was just merged and the wheel is being built. In this case, you can wait for around an hour to try again, or manually assign the previous commit in the installation using the `VLLM_PRECOMPILED_WHEEL_LOCATION` environment variable.
```bash ```bash
export VLLM_COMMIT=72d9c316d3f6ede485146fe5aabd4e61dbc59069 # use full commit hash from the main branch export VLLM_PRECOMPILED_WHEEL_COMMIT=$(git rev-parse HEAD~1) # or earlier commit on main
export VLLM_PRECOMPILED_WHEEL_LOCATION=https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl export VLLM_USE_PRECOMPILED=1
uv pip install --editable . uv pip install --editable .
``` ```
There are more environment variables to control the behavior of Python-only build:
* `VLLM_PRECOMPILED_WHEEL_LOCATION`: specify the exact wheel URL or local file path of a pre-compiled wheel to use. All other logic to find the wheel will be skipped.
* `VLLM_PRECOMPILED_WHEEL_COMMIT`: override the commit hash to download the pre-compiled wheel. It can be `nightly` to use the last **already built** commit on the main branch.
* `VLLM_PRECOMPILED_WHEEL_VARIANT`: specify the variant subdirectory to use on the nightly index, e.g., `cu129`, `cpu`. If not specified, the CUDA variant with `VLLM_MAIN_CUDA_VERSION` will be tried, then fallback to the default variant on the remote index.
You can find more information about vLLM's wheels in [Install the latest code](#install-the-latest-code). You can find more information about vLLM's wheels in [Install the latest code](#install-the-latest-code).
!!! note !!! note
There is a possibility that your source code may have a different commit ID compared to the latest vLLM wheel, which could potentially lead to unknown errors. There is a possibility that your source code may have a different commit ID compared to the latest vLLM wheel, which could potentially lead to unknown errors.
It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to [Install the latest code](#install-the-latest-code) for instructions on how to install a specified wheel. It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to [Install the latest code](#install-the-latest-code) for instructions on how to install a specified wheel.
#### Full build (with compilation) #### Full build (with compilation) {#full-build}
If you want to modify C++ or CUDA code, you'll need to build vLLM from source. This can take several minutes: If you want to modify C++ or CUDA code, you'll need to build vLLM from source. This can take several minutes:

View File

@ -52,7 +52,7 @@ vLLM is a Python library that supports the following GPU variants. Select your G
--8<-- "docs/getting_started/installation/gpu.xpu.inc.md:set-up-using-python" --8<-- "docs/getting_started/installation/gpu.xpu.inc.md:set-up-using-python"
### Pre-built wheels ### Pre-built wheels {#pre-built-wheels}
=== "NVIDIA CUDA" === "NVIDIA CUDA"

View File

@ -94,6 +94,9 @@ def auto_mock(module_name: str, attr: str, max_mocks: int = 100):
bench_latency = auto_mock("vllm.benchmarks", "latency") bench_latency = auto_mock("vllm.benchmarks", "latency")
bench_serve = auto_mock("vllm.benchmarks", "serve") bench_serve = auto_mock("vllm.benchmarks", "serve")
bench_sweep_plot = auto_mock("vllm.benchmarks.sweep.plot", "SweepPlotArgs") bench_sweep_plot = auto_mock("vllm.benchmarks.sweep.plot", "SweepPlotArgs")
bench_sweep_plot_pareto = auto_mock(
"vllm.benchmarks.sweep.plot_pareto", "SweepPlotParetoArgs"
)
bench_sweep_serve = auto_mock("vllm.benchmarks.sweep.serve", "SweepServeArgs") bench_sweep_serve = auto_mock("vllm.benchmarks.sweep.serve", "SweepServeArgs")
bench_sweep_serve_sla = auto_mock( bench_sweep_serve_sla = auto_mock(
"vllm.benchmarks.sweep.serve_sla", "SweepServeSLAArgs" "vllm.benchmarks.sweep.serve_sla", "SweepServeSLAArgs"
@ -221,6 +224,7 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
"bench_latency": create_parser(bench_latency.add_cli_args), "bench_latency": create_parser(bench_latency.add_cli_args),
"bench_serve": create_parser(bench_serve.add_cli_args), "bench_serve": create_parser(bench_serve.add_cli_args),
"bench_sweep_plot": create_parser(bench_sweep_plot.add_cli_args), "bench_sweep_plot": create_parser(bench_sweep_plot.add_cli_args),
"bench_sweep_plot_pareto": create_parser(bench_sweep_plot_pareto.add_cli_args),
"bench_sweep_serve": create_parser(bench_sweep_serve.add_cli_args), "bench_sweep_serve": create_parser(bench_sweep_serve.add_cli_args),
"bench_sweep_serve_sla": create_parser(bench_sweep_serve_sla.add_cli_args), "bench_sweep_serve_sla": create_parser(bench_sweep_serve_sla.add_cli_args),
"bench_throughput": create_parser(bench_throughput.add_cli_args), "bench_throughput": create_parser(bench_throughput.add_cli_args),

View File

@ -2,7 +2,8 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import itertools import itertools
import logging import logging
from dataclasses import dataclass, field from dataclasses import dataclass
from functools import cached_property
from pathlib import Path from pathlib import Path
from typing import Literal from typing import Literal
@ -16,13 +17,18 @@ EXAMPLE_DIR = ROOT_DIR / "examples"
EXAMPLE_DOC_DIR = ROOT_DIR / "docs/examples" EXAMPLE_DOC_DIR = ROOT_DIR / "docs/examples"
def fix_case(text: str) -> str: def title(text: str) -> str:
# Default title case
text = text.replace("_", " ").replace("/", " - ").title()
# Custom substitutions
subs = { subs = {
"io": "IO",
"api": "API", "api": "API",
"cli": "CLI", "cli": "CLI",
"cpu": "CPU", "cpu": "CPU",
"llm": "LLM", "llm": "LLM",
"mae": "MAE", "mae": "MAE",
"ner": "NER",
"tpu": "TPU", "tpu": "TPU",
"gguf": "GGUF", "gguf": "GGUF",
"lora": "LoRA", "lora": "LoRA",
@ -48,71 +54,65 @@ class Example:
Attributes: Attributes:
path (Path): The path to the main directory or file. path (Path): The path to the main directory or file.
category (str): The category of the document. category (str): The category of the document.
main_file (Path): The main file in the directory.
other_files (list[Path]): list of other files in the directory. Properties::
title (str): The title of the document. main_file() -> Path | None: Determines the main file in the given path.
other_files() -> list[Path]: Determines other files in the directory excluding
the main file.
title() -> str: Determines the title of the document.
Methods: Methods:
__post_init__(): Initializes the main_file, other_files, and title attributes.
determine_main_file() -> Path: Determines the main file in the given path.
determine_other_files() -> list[Path]: Determines other files in the directory excluding the main file.
determine_title() -> str: Determines the title of the document.
generate() -> str: Generates the documentation content. generate() -> str: Generates the documentation content.
""" # noqa: E501 """
path: Path path: Path
category: str = None category: str
main_file: Path = field(init=False)
other_files: list[Path] = field(init=False)
title: str = field(init=False)
def __post_init__(self): @cached_property
self.main_file = self.determine_main_file() def main_file(self) -> Path | None:
self.other_files = self.determine_other_files() """Determines the main file in the given path.
self.title = self.determine_title()
@property If path is a file, it returns the path itself. If path is a directory, it
def is_code(self) -> bool: searches for Markdown files (*.md) in the directory and returns the first one
return self.main_file.suffix != ".md" found. If no Markdown files are found, it returns None."""
# Single file example
if self.path.is_file():
return self.path
# Multi file example with a README
if md_paths := list(self.path.glob("*.md")):
return md_paths[0]
# Multi file example without a README
return None
def determine_main_file(self) -> Path: @cached_property
""" def other_files(self) -> list[Path]:
Determines the main file in the given path. """Determine other files in the directory excluding the main file.
If the path is a file, it returns the path itself. Otherwise, it searches
for Markdown files (*.md) in the directory and returns the first one found.
Returns:
Path: The main file path, either the original path if it's a file or the first
Markdown file found in the directory.
Raises:
IndexError: If no Markdown files are found in the directory.
""" # noqa: E501
return self.path if self.path.is_file() else list(self.path.glob("*.md")).pop()
def determine_other_files(self) -> list[Path]: If path is a file, it returns an empty list. Otherwise, it returns every file
""" in the directory except the main file in a list."""
Determine other files in the directory excluding the main file. # Single file example
This method checks if the given path is a file. If it is, it returns an empty list.
Otherwise, it recursively searches through the directory and returns a list of all
files that are not the main file.
Returns:
list[Path]: A list of Path objects representing the other files in the directory.
""" # noqa: E501
if self.path.is_file(): if self.path.is_file():
return [] return []
# Multi file example
is_other_file = lambda file: file.is_file() and file != self.main_file is_other_file = lambda file: file.is_file() and file != self.main_file
return [file for file in self.path.rglob("*") if is_other_file(file)] return sorted(file for file in self.path.rglob("*") if is_other_file(file))
def determine_title(self) -> str: @cached_property
if not self.is_code: def is_code(self) -> bool:
# Specify encoding for building on Windows return self.main_file is not None and self.main_file.suffix != ".md"
with open(self.main_file, encoding="utf-8") as f:
first_line = f.readline().strip() @cached_property
match = re.match(r"^#\s+(?P<title>.+)$", first_line) def title(self) -> str:
if match: # Generate title from filename if no main md file found
return match.group("title") if self.main_file is None or self.is_code:
return fix_case(self.path.stem.replace("_", " ").title()) return title(self.path.stem)
# Specify encoding for building on Windows
with open(self.main_file, encoding="utf-8") as f:
first_line = f.readline().strip()
match = re.match(r"^#\s+(?P<title>.+)$", first_line)
if match:
return match.group("title")
raise ValueError(f"Title not found in {self.main_file}")
def fix_relative_links(self, content: str) -> str: def fix_relative_links(self, content: str) -> str:
""" """
@ -156,24 +156,35 @@ class Example:
# included files containing code fences too # included files containing code fences too
code_fence = "``````" code_fence = "``````"
if self.is_code: if self.main_file is not None:
content += ( # Single file example or multi file example with a README
f"{code_fence}{self.main_file.suffix[1:]}\n" if self.is_code:
f'--8<-- "{self.main_file}"\n' content += (
f"{code_fence}\n" f"{code_fence}{self.main_file.suffix[1:]}\n"
) f'--8<-- "{self.main_file}"\n'
f"{code_fence}\n"
)
else:
with open(self.main_file, encoding="utf-8") as f:
# Skip the title from md snippets as it's been included above
main_content = f.readlines()[1:]
content += self.fix_relative_links("".join(main_content))
content += "\n"
else: else:
with open(self.main_file) as f: # Multi file example without a README
# Skip the title from md snippets as it's been included above for file in self.other_files:
main_content = f.readlines()[1:] file_title = title(str(file.relative_to(self.path).with_suffix("")))
content += self.fix_relative_links("".join(main_content)) content += f"## {file_title}\n\n"
content += "\n" content += (
f'{code_fence}{file.suffix[1:]}\n--8<-- "{file}"\n{code_fence}\n\n'
)
return content
if not self.other_files: if not self.other_files:
return content return content
content += "## Example materials\n\n" content += "## Example materials\n\n"
for file in sorted(self.other_files): for file in self.other_files:
content += f'??? abstract "{file.relative_to(self.path)}"\n' content += f'??? abstract "{file.relative_to(self.path)}"\n'
if file.suffix != ".md": if file.suffix != ".md":
content += f" {code_fence}{file.suffix[1:]}\n" content += f" {code_fence}{file.suffix[1:]}\n"
@ -200,11 +211,13 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
glob_patterns = ["*.py", "*.md", "*.sh"] glob_patterns = ["*.py", "*.md", "*.sh"]
# Find categorised examples # Find categorised examples
for category in categories: for category in categories:
logger.info("Processing category: %s", category.stem)
globs = [category.glob(pattern) for pattern in glob_patterns] globs = [category.glob(pattern) for pattern in glob_patterns]
for path in itertools.chain(*globs): for path in itertools.chain(*globs):
examples.append(Example(path, category.stem)) examples.append(Example(path, category.stem))
# Find examples in subdirectories # Find examples in subdirectories
for path in category.glob("*/*.md"): globs = [category.glob(f"*/{pattern}") for pattern in glob_patterns]
for path in itertools.chain(*globs):
examples.append(Example(path.parent, category.stem)) examples.append(Example(path.parent, category.stem))
# Generate the example documentation # Generate the example documentation
@ -217,3 +230,4 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
with open(doc_path, "w+", encoding="utf-8") as f: with open(doc_path, "w+", encoding="utf-8") as f:
f.write(example.generate()) f.write(example.generate())
logger.debug("Example generated: %s", doc_path.relative_to(ROOT_DIR)) logger.debug("Example generated: %s", doc_path.relative_to(ROOT_DIR))
logger.info("Total examples generated: %d", len(examples))

View File

@ -1,25 +1,33 @@
# CPU - Intel® Xeon® # CPU - Intel® Xeon®
## Validated Hardware
| Hardware |
| ----------------------------------------- |
| [Intel® Xeon® 6 Processors](https://www.intel.com/content/www/us/en/products/details/processors/xeon.html) |
| [Intel® Xeon® 5 Processors](https://www.intel.com/content/www/us/en/products/docs/processors/xeon/5th-gen-xeon-scalable-processors.html) |
## Supported Models ## Supported Models
### Text-only Language Models ### Text-only Language Models
| Model | Architecture | Supported | | Model | Architecture | Supported |
|--------------------------------------|-------------------------------------------|-----------| |--------------------------------------|-------------------------------------------|-----------|
| meta-llama/Llama-3.1 / 3.3 | LlamaForCausalLM | ✅ | | meta-llama/Llama-3.1-8B-Instruct | LlamaForCausalLM | ✅ |
| meta-llama/Llama-4-Scout | Llama4ForConditionalGeneration | ✅ | | meta-llama/Llama-3.2-3B-Instruct | LlamaForCausalLM | ✅ |
| meta-llama/Llama-4-Maverick | Llama4ForConditionalGeneration | ✅ | | ibm-granite/granite-3.2-2b-instruct | GraniteForCausalLM | ✅ |
| ibm-granite/granite (Granite-MOE) | GraniteMoeForCausalLM | ✅ | | Qwen/Qwen3-1.7B | Qwen3ForCausalLM | ✅ |
| Qwen/Qwen3 | Qwen3ForCausalLM | ✅ | | Qwen/Qwen3-4B | Qwen3ForCausalLM | ✅ |
| zai-org/GLM-4.5 | GLMForCausalLM | ✅ | | Qwen/Qwen3-8B | Qwen3ForCausalLM | ✅ |
| google/gemma | GemmaForCausalLM | ✅ | | zai-org/glm-4-9b-hf | GLMForCausalLM | ✅ |
| google/gemma-7b | GemmaForCausalLM | ✅ |
### Multimodal Language Models ### Multimodal Language Models
| Model | Architecture | Supported | | Model | Architecture | Supported |
|--------------------------------------|-------------------------------------------|-----------| |--------------------------------------|-------------------------------------------|-----------|
| Qwen/Qwen2.5-VL | Qwen2VLForConditionalGeneration | ✅ | | Qwen/Qwen2.5-VL-7B-Instruct | Qwen2VLForConditionalGeneration | ✅ |
| openai/whisper | WhisperForConditionalGeneration | ✅ | | openai/whisper-large-v3 | WhisperForConditionalGeneration | ✅ |
✅ Runs and optimized. ✅ Runs and optimized.
🟨 Runs and correct but not optimized to green yet. 🟨 Runs and correct but not optimized to green yet.

View File

@ -0,0 +1,65 @@
# XPU - Intel® GPUs
## Validated Hardware
| Hardware |
| ----------------------------------------- |
| [Intel® Arc™ Pro B-Series Graphics](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/arc/workstations/b-series/overview.html) |
## Supported Models
### Text-only Language Models
| Model | Architecture | FP16 | Dynamic FP8 | MXFP4 |
| ----------------------------------------- | ---------------------------------------------------- | ---- | ----------- | ----- |
| openai/gpt-oss-20b | GPTForCausalLM | | | ✅ |
| openai/gpt-oss-120b | GPTForCausalLM | | | ✅ |
| deepseek-ai/DeepSeek-R1-Distill-Llama-8B | LlamaForCausalLM | ✅ | ✅ | |
| deepseek-ai/DeepSeek-R1-Distill-Qwen-14B | QwenForCausalLM | ✅ | ✅ | |
| deepseek-ai/DeepSeek-R1-Distill-Qwen-32B | QwenForCausalLM | ✅ | ✅ | |
| deepseek-ai/DeepSeek-R1-Distill-Llama-70B | LlamaForCausalLM | ✅ | ✅ | |
| Qwen/Qwen2.5-72B-Instruct | Qwen2ForCausalLM | ✅ | ✅ | |
| Qwen/Qwen3-14B | Qwen3ForCausalLM | ✅ | ✅ | |
| Qwen/Qwen3-32B | Qwen3ForCausalLM | ✅ | ✅ | |
| Qwen/Qwen3-30B-A3B | Qwen3ForCausalLM | ✅ | ✅ | |
| Qwen/Qwen3-30B-A3B-GPTQ-Int4 | Qwen3ForCausalLM | ✅ | ✅ | |
| Qwen/Qwen3-coder-30B-A3B-Instruct | Qwen3ForCausalLM | ✅ | ✅ | |
| Qwen/QwQ-32B | QwenForCausalLM | ✅ | ✅ | |
| deepseek-ai/DeepSeek-V2-Lite | DeepSeekForCausalLM | ✅ | ✅ | |
| meta-llama/Llama-3.1-8B-Instruct | LlamaForCausalLM | ✅ | ✅ | |
| baichuan-inc/Baichuan2-13B-Chat | BaichuanForCausalLM | ✅ | ✅ | |
| THUDM/GLM-4-9B-chat | GLMForCausalLM | ✅ | ✅ | |
| THUDM/CodeGeex4-All-9B | CodeGeexForCausalLM | ✅ | ✅ | |
| chuhac/TeleChat2-35B | LlamaForCausalLM (TeleChat2 based on Llama arch) | ✅ | ✅ | |
| 01-ai/Yi1.5-34B-Chat | YiForCausalLM | ✅ | ✅ | |
| THUDM/CodeGeex4-All-9B | CodeGeexForCausalLM | ✅ | ✅ | |
| deepseek-ai/DeepSeek-Coder-33B-base | DeepSeekCoderForCausalLM | ✅ | ✅ | |
| baichuan-inc/Baichuan2-13B-Chat | BaichuanForCausalLM | ✅ | ✅ | |
| meta-llama/Llama-2-13b-chat-hf | LlamaForCausalLM | ✅ | ✅ | |
| THUDM/CodeGeex4-All-9B | CodeGeexForCausalLM | ✅ | ✅ | |
| Qwen/Qwen1.5-14B-Chat | QwenForCausalLM | ✅ | ✅ | |
| Qwen/Qwen1.5-32B-Chat | QwenForCausalLM | ✅ | ✅ | |
### Multimodal Language Models
| Model | Architecture | FP16 | Dynamic FP8 | MXFP4 |
| ---------------------------- | -------------------------------- | ---- | ----------- | ----- |
| OpenGVLab/InternVL3_5-8B | InternVLForConditionalGeneration | ✅ | ✅ | |
| OpenGVLab/InternVL3_5-14B | InternVLForConditionalGeneration | ✅ | ✅ | |
| OpenGVLab/InternVL3_5-38B | InternVLForConditionalGeneration | ✅ | ✅ | |
| Qwen/Qwen2-VL-7B-Instruct | Qwen2VLForConditionalGeneration | ✅ | ✅ | |
| Qwen/Qwen2.5-VL-72B-Instruct | Qwen2VLForConditionalGeneration | ✅ | ✅ | |
| Qwen/Qwen2.5-VL-32B-Instruct | Qwen2VLForConditionalGeneration | ✅ | ✅ | |
| THUDM/GLM-4v-9B | GLM4vForConditionalGeneration | ✅ | ✅ | |
| openbmb/MiniCPM-V-4 | MiniCPMVForConditionalGeneration | ✅ | ✅ | |
### Embedding and Reranker Language Models
| Model | Architecture | FP16 | Dynamic FP8 | MXFP4 |
| ----------------------- | ------------------------------ | ---- | ----------- | ----- |
| Qwen/Qwen3-Embedding-8B | Qwen3ForTextEmbedding | ✅ | ✅ | |
| Qwen/Qwen3-Reranker-8B | Qwen3ForSequenceClassification | ✅ | ✅ | |
✅ Runs and optimized.
🟨 Runs and correct but not optimized to green yet.
❌ Does not pass accuracy test or does not run.

View File

@ -1,15 +1,15 @@
# Pooling Models # Pooling Models
vLLM also supports pooling models, such as embedding, classification and reward models. vLLM also supports pooling models, such as embedding, classification, and reward models.
In vLLM, pooling models implement the [VllmModelForPooling][vllm.model_executor.models.VllmModelForPooling] interface. In vLLM, pooling models implement the [VllmModelForPooling][vllm.model_executor.models.VllmModelForPooling] interface.
These models use a [Pooler][vllm.model_executor.layers.pooler.Pooler] to extract the final hidden states of the input These models use a [Pooler][vllm.model_executor.layers.pooler.Pooler] to extract the final hidden states of the input
before returning them. before returning them.
!!! note !!! note
We currently support pooling models primarily as a matter of convenience. This is not guaranteed to have any performance improvement over using HF Transformers / Sentence Transformers directly. We currently support pooling models primarily for convenience. This is not guaranteed to provide any performance improvements over using Hugging Face Transformers or Sentence Transformers directly.
We are now planning to optimize pooling models in vLLM. Please comment on <https://github.com/vllm-project/vllm/issues/21796> if you have any suggestions! We plan to optimize pooling models in vLLM. Please comment on <https://github.com/vllm-project/vllm/issues/21796> if you have any suggestions!
## Configuration ## Configuration
@ -19,7 +19,7 @@ Run a model in pooling mode via the option `--runner pooling`.
!!! tip !!! tip
There is no need to set this option in the vast majority of cases as vLLM can automatically There is no need to set this option in the vast majority of cases as vLLM can automatically
detect the model runner to use via `--runner auto`. detect the appropriate model runner via `--runner auto`.
### Model Conversion ### Model Conversion
@ -78,7 +78,7 @@ When loading [Sentence Transformers](https://huggingface.co/sentence-transformer
its Sentence Transformers configuration file (`modules.json`) takes priority over the model's defaults. its Sentence Transformers configuration file (`modules.json`) takes priority over the model's defaults.
You can further customize this via the `--pooler-config` option, You can further customize this via the `--pooler-config` option,
which takes priority over both the model's and Sentence Transformers's defaults. which takes priority over both the model's and Sentence Transformers' defaults.
## Offline Inference ## Offline Inference
@ -168,11 +168,11 @@ The [encode][vllm.LLM.encode] method is available to all pooling models in vLLM.
- For embeddings, use `LLM.embed(...)` or `pooling_task="embed"`. - For embeddings, use `LLM.embed(...)` or `pooling_task="embed"`.
- For classification logits, use `LLM.classify(...)` or `pooling_task="classify"`. - For classification logits, use `LLM.classify(...)` or `pooling_task="classify"`.
- For similarity scores, use `LLM.score(...)`. - For similarity scores, use `LLM.score(...)`.
- For rewards, use `LLM.reward(...)` or `pooling_task="token_classify"`. - For rewards, use `LLM.reward(...)` or `pooling_task="token_classify"`.
- For token classification, use `pooling_task="token_classify"`. - For token classification, use `pooling_task="token_classify"`.
- For multi-vector retrieval, use `pooling_task="token_embed"` - For multi-vector retrieval, use `pooling_task="token_embed"`.
- For IO Processor Plugins , use `pooling_task="plugin"` - For IO Processor Plugins, use `pooling_task="plugin"`.
```python ```python
from vllm import LLM from vllm import LLM
@ -194,15 +194,15 @@ Our [OpenAI-Compatible Server](../serving/openai_compatible_server.md) provides
- [Pooling API](../serving/openai_compatible_server.md#pooling-api) is similar to `LLM.encode`, being applicable to all types of pooling models. - [Pooling API](../serving/openai_compatible_server.md#pooling-api) is similar to `LLM.encode`, being applicable to all types of pooling models.
!!! note !!! note
Please use one of the more specific methods or set the task directly when using [Pooling API](../serving/openai_compatible_server.md#pooling-api) api.: Please use one of the more specific endpoints or set the task directly when using the [Pooling API](../serving/openai_compatible_server.md#pooling-api):
- For embeddings, use [Embeddings API](../serving/openai_compatible_server.md#embeddings-api) or `"task":"embed"`. - For embeddings, use [Embeddings API](../serving/openai_compatible_server.md#embeddings-api) or `"task":"embed"`.
- For classification logits, use [Classification API](../serving/openai_compatible_server.md#classification-api) or `task":"classify"`. - For classification logits, use [Classification API](../serving/openai_compatible_server.md#classification-api) or `"task":"classify"`.
- For similarity scores, use [Score API](../serving/openai_compatible_server.md#score-api). - For similarity scores, use [Score API](../serving/openai_compatible_server.md#score-api).
- For rewards, `task":"token_classify"`. - For rewards, use `"task":"token_classify"`.
- For token classification, use `task":"token_classify"`. - For token classification, use `"task":"token_classify"`.
- For multi-vector retrieval, use `task":"token_embed"` - For multi-vector retrieval, use `"task":"token_embed"`.
- For IO Processor Plugins , use `task":"plugin"` - For IO Processor Plugins, use `"task":"plugin"`.
```python ```python
# start a supported embeddings model server with `vllm serve`, e.g. # start a supported embeddings model server with `vllm serve`, e.g.
@ -232,7 +232,7 @@ for output in response.json()["data"]:
## Matryoshka Embeddings ## Matryoshka Embeddings
[Matryoshka Embeddings](https://sbert.net/examples/sentence_transformer/training/matryoshka/README.html#matryoshka-embeddings) or [Matryoshka Representation Learning (MRL)](https://arxiv.org/abs/2205.13147) is a technique used in training embedding models. It allows user to trade off between performance and cost. [Matryoshka Embeddings](https://sbert.net/examples/sentence_transformer/training/matryoshka/README.html#matryoshka-embeddings) or [Matryoshka Representation Learning (MRL)](https://arxiv.org/abs/2205.13147) is a technique used in training embedding models. It allows users to trade off between performance and cost.
!!! warning !!! warning
Not all embedding models are trained using Matryoshka Representation Learning. To avoid misuse of the `dimensions` parameter, vLLM returns an error for requests that attempt to change the output dimension of models that do not support Matryoshka Embeddings. Not all embedding models are trained using Matryoshka Representation Learning. To avoid misuse of the `dimensions` parameter, vLLM returns an error for requests that attempt to change the output dimension of models that do not support Matryoshka Embeddings.
@ -245,9 +245,9 @@ for output in response.json()["data"]:
### Manually enable Matryoshka Embeddings ### Manually enable Matryoshka Embeddings
There is currently no official interface for specifying support for Matryoshka Embeddings. In vLLM, if `is_matryoshka` is `True` in `config.json,` it is allowed to change the output to arbitrary dimensions. Using `matryoshka_dimensions` can control the allowed output dimensions. There is currently no official interface for specifying support for Matryoshka Embeddings. In vLLM, if `is_matryoshka` is `True` in `config.json`, you can change the output dimension to arbitrary values. Use `matryoshka_dimensions` to control the allowed output dimensions.
For models that support Matryoshka Embeddings but not recognized by vLLM, please manually override the config using `hf_overrides={"is_matryoshka": True}`, `hf_overrides={"matryoshka_dimensions": [<allowed output dimensions>]}` (offline) or `--hf-overrides '{"is_matryoshka": true}'`, `--hf-overrides '{"matryoshka_dimensions": [<allowed output dimensions>]}'`(online). For models that support Matryoshka Embeddings but are not recognized by vLLM, manually override the config using `hf_overrides={"is_matryoshka": True}` or `hf_overrides={"matryoshka_dimensions": [<allowed output dimensions>]}` (offline), or `--hf-overrides '{"is_matryoshka": true}'` or `--hf-overrides '{"matryoshka_dimensions": [<allowed output dimensions>]}'` (online).
Here is an example to serve a model with Matryoshka Embeddings enabled. Here is an example to serve a model with Matryoshka Embeddings enabled.
@ -274,11 +274,11 @@ outputs = llm.embed(
print(outputs[0].outputs) print(outputs[0].outputs)
``` ```
A code example can be found here: [examples/offline_inference/pooling/embed_matryoshka_fy.py](../../examples/offline_inference/pooling/embed_matryoshka_fy.py) A code example can be found here: [examples/pooling/embed/embed_matryoshka_fy.py](../../examples/pooling/embed/embed_matryoshka_fy.py)
### Online Inference ### Online Inference
Use the following command to start vllm server. Use the following command to start the vLLM server.
```bash ```bash
vllm serve jinaai/jina-embeddings-v3 --trust-remote-code vllm serve jinaai/jina-embeddings-v3 --trust-remote-code
@ -304,17 +304,17 @@ Expected output:
{"id":"embd-5c21fc9a5c9d4384a1b021daccaf9f64","object":"list","created":1745476417,"model":"jinaai/jina-embeddings-v3","data":[{"index":0,"object":"embedding","embedding":[-0.3828125,-0.1357421875,0.03759765625,0.125,0.21875,0.09521484375,-0.003662109375,0.1591796875,-0.130859375,-0.0869140625,-0.1982421875,0.1689453125,-0.220703125,0.1728515625,-0.2275390625,-0.0712890625,-0.162109375,-0.283203125,-0.055419921875,-0.0693359375,0.031982421875,-0.04052734375,-0.2734375,0.1826171875,-0.091796875,0.220703125,0.37890625,-0.0888671875,-0.12890625,-0.021484375,-0.0091552734375,0.23046875]}],"usage":{"prompt_tokens":8,"total_tokens":8,"completion_tokens":0,"prompt_tokens_details":null}} {"id":"embd-5c21fc9a5c9d4384a1b021daccaf9f64","object":"list","created":1745476417,"model":"jinaai/jina-embeddings-v3","data":[{"index":0,"object":"embedding","embedding":[-0.3828125,-0.1357421875,0.03759765625,0.125,0.21875,0.09521484375,-0.003662109375,0.1591796875,-0.130859375,-0.0869140625,-0.1982421875,0.1689453125,-0.220703125,0.1728515625,-0.2275390625,-0.0712890625,-0.162109375,-0.283203125,-0.055419921875,-0.0693359375,0.031982421875,-0.04052734375,-0.2734375,0.1826171875,-0.091796875,0.220703125,0.37890625,-0.0888671875,-0.12890625,-0.021484375,-0.0091552734375,0.23046875]}],"usage":{"prompt_tokens":8,"total_tokens":8,"completion_tokens":0,"prompt_tokens_details":null}}
``` ```
An OpenAI client example can be found here: [examples/online_serving/pooling/openai_embedding_matryoshka_fy.py](../../examples/online_serving/pooling/openai_embedding_matryoshka_fy.py) An OpenAI client example can be found here: [examples/pooling/embed/openai_embedding_matryoshka_fy.py](../../examples/pooling/embed/openai_embedding_matryoshka_fy.py)
## Deprecated Features ## Deprecated Features
### Encode task ### Encode task
We have split the `encode` task into two more specific token wise tasks: `token_embed` and `token_classify`: We have split the `encode` task into two more specific token-wise tasks: `token_embed` and `token_classify`:
- `token_embed` is the same as embed, using normalize as activation. - `token_embed` is the same as `embed`, using normalization as the activation.
- `token_classify` is the same as classify, default using softmax as activation. - `token_classify` is the same as `classify`, by default using softmax as the activation.
### Remove softmax from PoolingParams ### Remove softmax from PoolingParams
We are going to remove `softmax` and `activation` from `PoolingParams`. Instead, you should set `use_activation`, since we actually allow `classify` and `token_classify` to use any activation function. We are going to remove `softmax` and `activation` from `PoolingParams`. Instead, use `use_activation`, since we allow `classify` and `token_classify` to use any activation function.

View File

@ -417,7 +417,8 @@ th {
| `MiniCPMForCausalLM` | MiniCPM | `openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, `openbmb/MiniCPM-S-1B-sft`, etc. | ✅︎ | ✅︎ | | `MiniCPMForCausalLM` | MiniCPM | `openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, `openbmb/MiniCPM-S-1B-sft`, etc. | ✅︎ | ✅︎ |
| `MiniCPM3ForCausalLM` | MiniCPM3 | `openbmb/MiniCPM3-4B`, etc. | ✅︎ | ✅︎ | | `MiniCPM3ForCausalLM` | MiniCPM3 | `openbmb/MiniCPM3-4B`, etc. | ✅︎ | ✅︎ |
| `MiniMaxM2ForCausalLM` | MiniMax-M2 |`MiniMaxAI/MiniMax-M2`, etc. | | ✅︎ | | `MiniMaxM2ForCausalLM` | MiniMax-M2 |`MiniMaxAI/MiniMax-M2`, etc. | | ✅︎ |
| `MistralForCausalLM` | Mistral, Mistral-Instruct | `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc. | ✅︎ | ✅︎ | | `MistralForCausalLM` | Ministral-3, Mistral, Mistral-Instruct | `mistralai/Ministral-3-3B-Instruct-2512`, `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc. | ✅︎ | ✅︎ |
| `MistralLarge3ForCausalLM` | Mistral-Large-3-675B-Base-2512, Mistral-Large-3-675B-Instruct-2512 | `mistralai/Mistral-Large-3-675B-Base-2512`, `mistralai/Mistral-Large-3-675B-Instruct-2512`, etc. | ✅︎ | ✅︎ |
| `MixtralForCausalLM` | Mixtral-8x7B, Mixtral-8x7B-Instruct | `mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc. | ✅︎ | ✅︎ | | `MixtralForCausalLM` | Mixtral-8x7B, Mixtral-8x7B-Instruct | `mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc. | ✅︎ | ✅︎ |
| `MPTForCausalLM` | MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter | `mosaicml/mpt-7b`, `mosaicml/mpt-7b-storywriter`, `mosaicml/mpt-30b`, etc. | | ✅︎ | | `MPTForCausalLM` | MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter | `mosaicml/mpt-7b`, `mosaicml/mpt-7b-storywriter`, `mosaicml/mpt-30b`, etc. | | ✅︎ |
| `NemotronForCausalLM` | Nemotron-3, Nemotron-4, Minitron | `nvidia/Minitron-8B-Base`, `mgoin/Nemotron-4-340B-Base-hf-FP8`, etc. | ✅︎ | ✅︎ | | `NemotronForCausalLM` | Nemotron-3, Nemotron-4, Minitron | `nvidia/Minitron-8B-Base`, `mgoin/Nemotron-4-340B-Base-hf-FP8`, etc. | ✅︎ | ✅︎ |
@ -479,6 +480,7 @@ These models primarily support the [`LLM.embed`](./pooling_models.md#llmembed) A
| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | | Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
|--------------|--------|-------------------|----------------------|---------------------------| |--------------|--------|-------------------|----------------------|---------------------------|
| `BertModel`<sup>C</sup> | BERT-based | `BAAI/bge-base-en-v1.5`, `Snowflake/snowflake-arctic-embed-xs`, etc. | | | | `BertModel`<sup>C</sup> | BERT-based | `BAAI/bge-base-en-v1.5`, `Snowflake/snowflake-arctic-embed-xs`, etc. | | |
| `BertSpladeSparseEmbeddingModel` | SPLADE | `naver/splade-v3` | | |
| `Gemma2Model`<sup>C</sup> | Gemma 2-based | `BAAI/bge-multilingual-gemma2`, etc. | ✅︎ | ✅︎ | | `Gemma2Model`<sup>C</sup> | Gemma 2-based | `BAAI/bge-multilingual-gemma2`, etc. | ✅︎ | ✅︎ |
| `Gemma3TextModel`<sup>C</sup> | Gemma 3-based | `google/embeddinggemma-300m`, etc. | ✅︎ | ✅︎ | | `Gemma3TextModel`<sup>C</sup> | Gemma 3-based | `google/embeddinggemma-300m`, etc. | ✅︎ | ✅︎ |
| `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ | | `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ |
@ -566,7 +568,7 @@ These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) A
``` ```
!!! note !!! note
Load the official original `Qwen3 Reranker` by using the following command. More information can be found at: [examples/offline_inference/pooling/qwen3_reranker.py](../../examples/offline_inference/pooling/qwen3_reranker.py). Load the official original `Qwen3 Reranker` by using the following command. More information can be found at: [examples/pooling/score/qwen3_reranker.py](../../examples/pooling/score/qwen3_reranker.py).
```bash ```bash
vllm serve Qwen/Qwen3-Reranker-0.6B --hf_overrides '{"architectures": ["Qwen3ForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}' vllm serve Qwen/Qwen3-Reranker-0.6B --hf_overrides '{"architectures": ["Qwen3ForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}'
@ -604,7 +606,7 @@ These models primarily support the [`LLM.encode`](./pooling_models.md#llmencode)
| `ModernBertForTokenClassification` | ModernBERT-based | `disham993/electrical-ner-ModernBERT-base` | | | | `ModernBertForTokenClassification` | ModernBERT-based | `disham993/electrical-ner-ModernBERT-base` | | |
!!! note !!! note
Named Entity Recognition (NER) usage, please refer to [examples/offline_inference/pooling/ner.py](../../examples/offline_inference/pooling/ner.py), [examples/online_serving/pooling/ner_client.py](../../examples/online_serving/pooling/ner_client.py). Named Entity Recognition (NER) usage, please refer to [examples/pooling/token_classify/ner.py](../../examples/pooling/token_classify/ner.py), [examples/pooling/token_classify/ner_client.py](../../examples/pooling/token_classify/ner_client.py).
## List of Multimodal Language Models ## List of Multimodal Language Models
@ -710,7 +712,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
| `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + I<sup>E+</sup> | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | | ✅︎ | | `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + I<sup>E+</sup> | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | | ✅︎ |
| `Phi4MMForCausalLM` | Phi-4-multimodal | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct`, etc. | ✅︎ | ✅︎ | | `Phi4MMForCausalLM` | Phi-4-multimodal | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct`, etc. | ✅︎ | ✅︎ |
| `Phi4MultimodalForCausalLM` | Phi-4-multimodal (HF Transformers) | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct` (with revision `refs/pr/70`), etc. | ✅︎ | ✅︎ | | `Phi4MultimodalForCausalLM` | Phi-4-multimodal (HF Transformers) | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct` (with revision `refs/pr/70`), etc. | ✅︎ | ✅︎ |
| `PixtralForConditionalGeneration` | Mistral 3 (Mistral format), Pixtral (Mistral format) | T + I<sup>+</sup> | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistralai/Pixtral-12B-2409`, etc. | | ✅︎ | | `PixtralForConditionalGeneration` | Ministral 3 (Mistral format), Mistral 3 (Mistral format), Mistral Large 3 (Mistral format), Pixtral (Mistral format) | T + I<sup>+</sup> | `mistralai/Ministral-3-3B-Instruct-2512`, `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistralai/Mistral-Large-3-675B-Instruct-2512`, `mistralai/Pixtral-12B-2409`, etc. | | ✅︎ |
| `QwenVLForConditionalGeneration`<sup>^</sup> | Qwen-VL | T + I<sup>E+</sup> | `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc. | ✅︎ | ✅︎ | | `QwenVLForConditionalGeneration`<sup>^</sup> | Qwen-VL | T + I<sup>E+</sup> | `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc. | ✅︎ | ✅︎ |
| `Qwen2AudioForConditionalGeneration` | Qwen2-Audio | T + A<sup>+</sup> | `Qwen/Qwen2-Audio-7B-Instruct` | | ✅︎ | | `Qwen2AudioForConditionalGeneration` | Qwen2-Audio | T + A<sup>+</sup> | `Qwen/Qwen2-Audio-7B-Instruct` | | ✅︎ |
| `Qwen2VLForConditionalGeneration` | QVQ, Qwen2-VL | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ | | `Qwen2VLForConditionalGeneration` | QVQ, Qwen2-VL | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ |
@ -725,6 +727,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
| `Step3VLForConditionalGeneration` | Step3-VL | T + I<sup>+</sup> | `stepfun-ai/step3` | | ✅︎ | | `Step3VLForConditionalGeneration` | Step3-VL | T + I<sup>+</sup> | `stepfun-ai/step3` | | ✅︎ |
| `TarsierForConditionalGeneration` | Tarsier | T + I<sup>E+</sup> | `omni-search/Tarsier-7b`, `omni-search/Tarsier-34b` | | ✅︎ | | `TarsierForConditionalGeneration` | Tarsier | T + I<sup>E+</sup> | `omni-search/Tarsier-7b`, `omni-search/Tarsier-34b` | | ✅︎ |
| `Tarsier2ForConditionalGeneration`<sup>^</sup> | Tarsier2 | T + I<sup>E+</sup> + V<sup>E+</sup> | `omni-research/Tarsier2-Recap-7b`, `omni-research/Tarsier2-7b-0115` | | ✅︎ | | `Tarsier2ForConditionalGeneration`<sup>^</sup> | Tarsier2 | T + I<sup>E+</sup> + V<sup>E+</sup> | `omni-research/Tarsier2-Recap-7b`, `omni-research/Tarsier2-7b-0115` | | ✅︎ |
| `UltravoxModel` | Ultravox | T + A<sup>E+</sup> | `fixie-ai/ultravox-v0_5-llama-3_2-1b` | ✅︎ | ✅︎ |
Some models are supported only via the [Transformers modeling backend](#transformers). The purpose of the table below is to acknowledge models which we officially support in this way. The logs will say that the Transformers modeling backend is being used, and you will see no warning that this is fallback behaviour. This means that, if you have issues with any of the models listed below, please [make an issue](https://github.com/vllm-project/vllm/issues/new/choose) and we'll do our best to fix it! Some models are supported only via the [Transformers modeling backend](#transformers). The purpose of the table below is to acknowledge models which we officially support in this way. The logs will say that the Transformers modeling backend is being used, and you will see no warning that this is fallback behaviour. This means that, if you have issues with any of the models listed below, please [make an issue](https://github.com/vllm-project/vllm/issues/new/choose) and we'll do our best to fix it!

View File

@ -49,7 +49,8 @@ We currently support the following OpenAI APIs:
- *Note: `suffix` parameter is not supported.* - *Note: `suffix` parameter is not supported.*
- [Chat Completions API](#chat-api) (`/v1/chat/completions`) - [Chat Completions API](#chat-api) (`/v1/chat/completions`)
- Only applicable to [text generation models](../models/generative_models.md) with a [chat template](../serving/openai_compatible_server.md#chat-template). - Only applicable to [text generation models](../models/generative_models.md) with a [chat template](../serving/openai_compatible_server.md#chat-template).
- *Note: `parallel_tool_calls` and `user` parameters are ignored.* - *Note: `user` parameter is ignored.*
- *Note:* Setting the `parallel_tool_calls` parameter to `false` ensures vLLM only returns zero or one tool call per request. Setting it to `true` (the default) allows returning more than one tool call per request. There is no guarantee more than one tool call will be returned if this is set to `true`, as that behavior is model dependent and not all models are designed to support parallel tool calls.
- [Embeddings API](#embeddings-api) (`/v1/embeddings`) - [Embeddings API](#embeddings-api) (`/v1/embeddings`)
- Only applicable to [embedding models](../models/pooling_models.md). - Only applicable to [embedding models](../models/pooling_models.md).
- [Transcriptions API](#transcriptions-api) (`/v1/audio/transcriptions`) - [Transcriptions API](#transcriptions-api) (`/v1/audio/transcriptions`)
@ -233,7 +234,7 @@ The following extra parameters are supported:
Our Embeddings API is compatible with [OpenAI's Embeddings API](https://platform.openai.com/docs/api-reference/embeddings); Our Embeddings API is compatible with [OpenAI's Embeddings API](https://platform.openai.com/docs/api-reference/embeddings);
you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it. you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it.
Code example: [examples/online_serving/pooling/openai_embedding_client.py](../../examples/online_serving/pooling/openai_embedding_client.py) Code example: [examples/pooling/embed/openai_embedding_client.py](../../examples/pooling/embed/openai_embedding_client.py)
If the model has a [chat template](../serving/openai_compatible_server.md#chat-template), you can replace `inputs` with a list of `messages` (same schema as [Chat API](#chat-api)) If the model has a [chat template](../serving/openai_compatible_server.md#chat-template), you can replace `inputs` with a list of `messages` (same schema as [Chat API](#chat-api))
which will be treated as a single prompt to the model. Here is a convenience function for calling the API while retaining OpenAI's type annotations: which will be treated as a single prompt to the model. Here is a convenience function for calling the API while retaining OpenAI's type annotations:
@ -334,7 +335,7 @@ and passing a list of `messages` in the request. Refer to the examples below for
`MrLight/dse-qwen2-2b-mrl-v1` requires a placeholder image of the minimum image size for text query embeddings. See the full code `MrLight/dse-qwen2-2b-mrl-v1` requires a placeholder image of the minimum image size for text query embeddings. See the full code
example below for details. example below for details.
Full example: [examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py](../../examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py) Full example: [examples/pooling/embed/openai_chat_embedding_client_for_multimodal.py](../../examples/pooling/embed/openai_chat_embedding_client_for_multimodal.py)
#### Extra parameters #### Extra parameters
@ -350,7 +351,7 @@ The following extra parameters are supported by default:
??? code ??? code
```python ```python
--8<-- "vllm/entrypoints/openai/protocol.py:embedding-extra-params" --8<-- "vllm/entrypoints/pooling/embed/protocol.py:embedding-extra-params"
``` ```
For chat-like input (i.e. if `messages` is passed), these extra parameters are supported instead: For chat-like input (i.e. if `messages` is passed), these extra parameters are supported instead:
@ -358,7 +359,7 @@ For chat-like input (i.e. if `messages` is passed), these extra parameters are s
??? code ??? code
```python ```python
--8<-- "vllm/entrypoints/openai/protocol.py:chat-embedding-extra-params" --8<-- "vllm/entrypoints/pooling/embed/protocol.py:chat-embedding-extra-params"
``` ```
### Transcriptions API ### Transcriptions API
@ -455,6 +456,7 @@ For `verbose_json` response format:
] ]
} }
``` ```
Currently, the `verbose_json` response format doesn't support `avg_logprob`, `compression_ratio`, or `no_speech_prob`.
#### Extra Parameters #### Extra Parameters
@ -514,7 +516,7 @@ Our Pooling API encodes input prompts using a [pooling model](../models/pooling_
The input format is the same as [Embeddings API](#embeddings-api), but the output data can contain an arbitrary nested list, not just a 1-D list of floats. The input format is the same as [Embeddings API](#embeddings-api), but the output data can contain an arbitrary nested list, not just a 1-D list of floats.
Code example: [examples/online_serving/pooling/openai_pooling_client.py](../../examples/online_serving/pooling/openai_pooling_client.py) Code example: [examples/pooling/pooling/openai_pooling_client.py](../../examples/pooling/pooling/openai_pooling_client.py)
### Classification API ### Classification API
@ -522,7 +524,7 @@ Our Classification API directly supports Hugging Face sequence-classification mo
We automatically wrap any other transformer via `as_seq_cls_model()`, which pools on the last token, attaches a `RowParallelLinear` head, and applies a softmax to produce per-class probabilities. We automatically wrap any other transformer via `as_seq_cls_model()`, which pools on the last token, attaches a `RowParallelLinear` head, and applies a softmax to produce per-class probabilities.
Code example: [examples/online_serving/pooling/openai_classification_client.py](../../examples/online_serving/pooling/openai_classification_client.py) Code example: [examples/pooling/classify/openai_classification_client.py](../../examples/pooling/classify/openai_classification_client.py)
#### Example Requests #### Example Requests
@ -628,7 +630,7 @@ The following [pooling parameters][vllm.PoolingParams] are supported.
The following extra parameters are supported: The following extra parameters are supported:
```python ```python
--8<-- "vllm/entrypoints/openai/protocol.py:classification-extra-params" --8<-- "vllm/entrypoints/pooling/classify/protocol.py:classification-extra-params"
``` ```
### Score API ### Score API
@ -638,7 +640,7 @@ Usually, the score for a sentence pair refers to the similarity between two sent
You can find the documentation for cross encoder models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html). You can find the documentation for cross encoder models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html).
Code example: [examples/online_serving/pooling/openai_cross_encoder_score.py](../../examples/online_serving/pooling/openai_cross_encoder_score.py) Code example: [examples/pooling/score/openai_cross_encoder_score.py](../../examples/pooling/score/openai_cross_encoder_score.py)
#### Single inference #### Single inference
@ -819,7 +821,7 @@ You can pass multi-modal inputs to scoring models by passing `content` including
print("Scoring output:", response_json["data"][0]["score"]) print("Scoring output:", response_json["data"][0]["score"])
print("Scoring output:", response_json["data"][1]["score"]) print("Scoring output:", response_json["data"][1]["score"])
``` ```
Full example: [examples/online_serving/pooling/openai_cross_encoder_score_for_multimodal.py](../../examples/online_serving/pooling/openai_cross_encoder_score_for_multimodal.py) Full example: [examples/pooling/score/openai_cross_encoder_score_for_multimodal.py](../../examples/pooling/score/openai_cross_encoder_score_for_multimodal.py)
#### Extra parameters #### Extra parameters
@ -833,7 +835,7 @@ The following [pooling parameters][vllm.PoolingParams] are supported.
The following extra parameters are supported: The following extra parameters are supported:
```python ```python
--8<-- "vllm/entrypoints/openai/protocol.py:score-extra-params" --8<-- "vllm/entrypoints/pooling/score/protocol.py:score-extra-params"
``` ```
### Re-rank API ### Re-rank API
@ -849,7 +851,7 @@ endpoints are compatible with both [Jina AI's re-rank API interface](https://jin
[Cohere's re-rank API interface](https://docs.cohere.com/v2/reference/rerank) to ensure compatibility with [Cohere's re-rank API interface](https://docs.cohere.com/v2/reference/rerank) to ensure compatibility with
popular open-source tools. popular open-source tools.
Code example: [examples/online_serving/pooling/jinaai_rerank_client.py](../../examples/online_serving/pooling/jinaai_rerank_client.py) Code example: [examples/pooling/score/jinaai_rerank_client.py](../../examples/pooling/score/jinaai_rerank_client.py)
#### Example Request #### Example Request
@ -914,7 +916,7 @@ The following [pooling parameters][vllm.PoolingParams] are supported.
The following extra parameters are supported: The following extra parameters are supported:
```python ```python
--8<-- "vllm/entrypoints/openai/protocol.py:rerank-extra-params" --8<-- "vllm/entrypoints/pooling/score/protocol.py:rerank-extra-params"
``` ```
## Ray Serve LLM ## Ray Serve LLM

15
examples/offline_inference/audio_language.py Normal file → Executable file
View File

@ -425,6 +425,13 @@ def parse_args():
default=None, default=None,
help="Set the seed when initializing `vllm.LLM`.", help="Set the seed when initializing `vllm.LLM`.",
) )
parser.add_argument(
"--tensor-parallel-size",
"-tp",
type=int,
default=None,
help="Tensor parallel size to override the model's default setting. ",
)
return parser.parse_args() return parser.parse_args()
@ -434,6 +441,12 @@ def main(args):
if model not in model_example_map: if model not in model_example_map:
raise ValueError(f"Model type {model} is not supported.") raise ValueError(f"Model type {model} is not supported.")
if args.tensor_parallel_size is not None and args.tensor_parallel_size < 1:
raise ValueError(
f"tensor_parallel_size must be a positive integer, "
f"got {args.tensor_parallel_size}"
)
audio_count = args.num_audios audio_count = args.num_audios
req_data = model_example_map[model]( req_data = model_example_map[model](
question_per_audio_count[audio_count], audio_count question_per_audio_count[audio_count], audio_count
@ -446,6 +459,8 @@ def main(args):
) )
engine_args = asdict(req_data.engine_args) | {"seed": args.seed} engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
if args.tensor_parallel_size is not None:
engine_args["tensor_parallel_size"] = args.tensor_parallel_size
llm = LLM(**engine_args) llm = LLM(**engine_args)
# We set temperature to 0.2 so that outputs can be different # We set temperature to 0.2 so that outputs can be different

View File

@ -0,0 +1,98 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This file demonstrates preempt requests when using the `LLMEngine`
for processing prompts with various sampling parameters.
"""
import argparse
from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
from vllm.utils.argparse_utils import FlexibleArgumentParser
def create_test_prompts() -> list[tuple[str, SamplingParams]]:
    """Build the (prompt, SamplingParams) pairs exercised by this demo.

    The first two entries are deliberately identical long prompts so the
    engine has enough queued work to demonstrate request preemption.
    """
    long_prompt = "A robot may not injure a human being " * 50
    # Two identical greedy requests with per-token logprobs.
    prompts: list[tuple[str, SamplingParams]] = [
        (
            long_prompt,
            SamplingParams(
                temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=16
            ),
        )
        for _ in range(2)
    ]
    # A short sampled request with top-k and a presence penalty.
    prompts.append(
        (
            "To be or not to be,",
            SamplingParams(
                temperature=0.8, top_k=5, presence_penalty=0.2, max_tokens=128
            ),
        )
    )
    # A two-sequence (n=2) nucleus-sampling request with a frequency penalty.
    prompts.append(
        (
            "What is the meaning of life?",
            SamplingParams(
                n=2, temperature=0.8, top_p=0.95, frequency_penalty=0.1, max_tokens=128
            ),
        )
    )
    return prompts
def process_requests(engine: LLMEngine, test_prompts: list[tuple[str, SamplingParams]]):
    """Continuously feed prompts into the engine and drain its outputs.

    One prompt is submitted per step; at step 10 the prefix cache is reset
    (including running requests) to demonstrate preemption behavior.

    Args:
        engine: The LLMEngine to drive.
        test_prompts: (prompt, sampling_params) pairs; consumed from the front.
    """
    # Hoisted out of the loop: re-importing on every iteration was wasteful.
    import os

    request_id = 0
    print("-" * 50)
    step_id = 0
    while test_prompts or engine.has_unfinished_requests():
        print("-" * 50)
        print(f"Step {step_id} (pid={os.getpid()})")
        if test_prompts:
            # Submit at most one new request per engine step.
            prompt, sampling_params = test_prompts.pop(0)
            engine.add_request(str(request_id), prompt, sampling_params)
            request_id += 1
        if step_id == 10:
            # Force preemption of in-flight requests to showcase recovery.
            print(f"Resetting prefix cache at {step_id}")
            engine.reset_prefix_cache(reset_running_requests=True)
        request_outputs: list[RequestOutput] = engine.step()
        for request_output in request_outputs:
            if request_output.finished:
                print("-" * 50)
                print(request_output)
                print("-" * 50)
        step_id += 1
def initialize_engine(args: argparse.Namespace) -> LLMEngine:
    """Create an LLMEngine from parsed command-line arguments."""
    return LLMEngine.from_engine_args(EngineArgs.from_cli_args(args))
def parse_args():
    """Parse the engine's CLI arguments for this demo."""
    arg_parser = FlexibleArgumentParser(
        description="Demo on using the LLMEngine class directly"
    )
    # add_cli_args returns the same parser, so the call can be chained.
    return EngineArgs.add_cli_args(arg_parser).parse_args()
def main(args: argparse.Namespace):
    """Entry point: build the engine and run the demo prompts through it."""
    # Arguments are evaluated left to right: engine first, then prompts.
    process_requests(initialize_engine(args), create_test_prompts())
if __name__ == "__main__":
    # Parse CLI arguments and run the demo.
    main(parse_args())

View File

@ -46,7 +46,6 @@ def create_test_prompts(
logprobs=1, logprobs=1,
prompt_logprobs=1, prompt_logprobs=1,
max_tokens=128, max_tokens=128,
stop_token_ids=[32003],
), ),
LoRARequest("sql-lora", 1, lora_path), LoRARequest("sql-lora", 1, lora_path),
), ),
@ -57,7 +56,6 @@ def create_test_prompts(
logprobs=1, logprobs=1,
prompt_logprobs=1, prompt_logprobs=1,
max_tokens=128, max_tokens=128,
stop_token_ids=[32003],
), ),
LoRARequest("sql-lora2", 2, lora_path), LoRARequest("sql-lora2", 2, lora_path),
), ),
@ -98,7 +96,7 @@ def initialize_engine() -> LLMEngine:
# use the same rank, it is recommended to set this as low as possible. # use the same rank, it is recommended to set this as low as possible.
# max_cpu_loras: controls the size of the CPU LoRA cache. # max_cpu_loras: controls the size of the CPU LoRA cache.
engine_args = EngineArgs( engine_args = EngineArgs(
model="meta-llama/Llama-2-7b-hf", model="meta-llama/Llama-3.2-3B-Instruct",
enable_lora=True, enable_lora=True,
max_loras=1, max_loras=1,
max_lora_rank=8, max_lora_rank=8,
@ -111,7 +109,7 @@ def initialize_engine() -> LLMEngine:
def main(): def main():
"""Main function that sets up and runs the prompt processing.""" """Main function that sets up and runs the prompt processing."""
engine = initialize_engine() engine = initialize_engine()
lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test") lora_path = snapshot_download(repo_id="jeeejeee/llama32-3b-text2sql-spider")
test_prompts = create_test_prompts(lora_path) test_prompts = create_test_prompts(lora_path)
process_requests(engine, test_prompts) process_requests(engine, test_prompts)

View File

@ -1,57 +0,0 @@
# Pooling models
## Convert llm model to seq cls
```bash
# for BAAI/bge-reranker-v2-gemma
# Caution: "Yes" and "yes" are two different tokens
python examples/offline_inference/pooling/convert_model_to_seq_cls.py --model_name BAAI/bge-reranker-v2-gemma --classifier_from_tokens '["Yes"]' --method no_post_processing --path ./bge-reranker-v2-gemma-seq-cls
# for mxbai-rerank-v2
python examples/offline_inference/pooling/convert_model_to_seq_cls.py --model_name mixedbread-ai/mxbai-rerank-base-v2 --classifier_from_tokens '["0", "1"]' --method from_2_way_softmax --path ./mxbai-rerank-base-v2-seq-cls
# for Qwen3-Reranker
python examples/offline_inference/pooling/convert_model_to_seq_cls.py --model_name Qwen/Qwen3-Reranker-0.6B --classifier_from_tokens '["no", "yes"]' --method from_2_way_softmax --path ./Qwen3-Reranker-0.6B-seq-cls
```
## Embed jina_embeddings_v3 usage
Only text matching task is supported for now. See <https://github.com/vllm-project/vllm/pull/16120>
```bash
python examples/offline_inference/pooling/embed_jina_embeddings_v3.py
```
## Embed matryoshka dimensions usage
```bash
python examples/offline_inference/pooling/embed_matryoshka_fy.py
```
## Multi vector retrieval usage
```bash
python examples/offline_inference/pooling/multi_vector_retrieval.py
```
## Named Entity Recognition (NER) usage
```bash
python examples/offline_inference/pooling/ner.py
```
## Prithvi Geospatial MAE usage
```bash
python examples/offline_inference/pooling/prithvi_geospatial_mae.py
```
## IO Processor Plugins for Prithvi Geospatial MAE
```bash
python examples/offline_inference/pooling/prithvi_geospatial_mae_io_processor.py
```
## Qwen3 reranker usage
```bash
python examples/offline_inference/pooling/qwen3_reranker.py
```

View File

@ -133,7 +133,7 @@ def main(args):
tensor_parallel_size=args.tp, tensor_parallel_size=args.tp,
enable_chunked_prefill=args.enable_chunked_prefill, enable_chunked_prefill=args.enable_chunked_prefill,
enforce_eager=args.enforce_eager, enforce_eager=args.enforce_eager,
gpu_memory_utilization=0.8, gpu_memory_utilization=0.9,
speculative_config=speculative_config, speculative_config=speculative_config,
disable_log_stats=False, disable_log_stats=False,
max_model_len=args.max_model_len, max_model_len=args.max_model_len,

20
examples/offline_inference/vision_language.py Normal file → Executable file
View File

@ -1801,7 +1801,10 @@ def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData:
engine_args = EngineArgs( engine_args = EngineArgs(
model=model_name, model=model_name,
max_model_len=4096, max_model_len=4096,
hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]}, hf_overrides={
"architectures": ["Tarsier2ForConditionalGeneration"],
"model_type": "tarsier2",
},
limit_mm_per_prompt={modality: 1}, limit_mm_per_prompt={modality: 1},
) )
@ -2064,6 +2067,13 @@ def parse_args():
help="If True, will send all requests in a second batch with empty mm " help="If True, will send all requests in a second batch with empty mm "
"data to verify cache hits with UUIDs.", "data to verify cache hits with UUIDs.",
) )
parser.add_argument(
"--tensor-parallel-size",
"-tp",
type=int,
default=None,
help="Tensor parallel size to override the model's default setting. ",
)
return parser.parse_args() return parser.parse_args()
@ -2072,6 +2082,12 @@ def main(args):
if model not in model_example_map: if model not in model_example_map:
raise ValueError(f"Model type {model} is not supported.") raise ValueError(f"Model type {model} is not supported.")
if args.tensor_parallel_size is not None and args.tensor_parallel_size < 1:
raise ValueError(
f"tensor_parallel_size must be a positive integer, "
f"got {args.tensor_parallel_size}"
)
modality = args.modality modality = args.modality
mm_input = get_multi_modal_input(args) mm_input = get_multi_modal_input(args)
data = mm_input["data"] data = mm_input["data"]
@ -2089,6 +2105,8 @@ def main(args):
"seed": args.seed, "seed": args.seed,
"mm_processor_cache_gb": 0 if args.disable_mm_processor_cache else 4, "mm_processor_cache_gb": 0 if args.disable_mm_processor_cache else 4,
} }
if args.tensor_parallel_size is not None:
engine_args["tensor_parallel_size"] = args.tensor_parallel_size
llm = LLM(**engine_args) llm = LLM(**engine_args)
# Don't want to check the flag multiple times, so just hijack `prompts`. # Don't want to check the flag multiple times, so just hijack `prompts`.

View File

@ -1222,7 +1222,10 @@ def load_tarsier2(question: str, image_urls: list[str]) -> ModelRequestData:
trust_remote_code=True, trust_remote_code=True,
max_model_len=32768, max_model_len=32768,
limit_mm_per_prompt={"image": len(image_urls)}, limit_mm_per_prompt={"image": len(image_urls)},
hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]}, hf_overrides={
"architectures": ["Tarsier2ForConditionalGeneration"],
"model_type": "tarsier2",
},
) )
prompt = ( prompt = (
@ -1352,10 +1355,18 @@ model_example_map = {
} }
def run_generate(model, question: str, image_urls: list[str], seed: int | None): def run_generate(
model,
question: str,
image_urls: list[str],
seed: int | None,
tensor_parallel_size: int | None,
):
req_data = model_example_map[model](question, image_urls) req_data = model_example_map[model](question, image_urls)
engine_args = asdict(req_data.engine_args) | {"seed": args.seed} engine_args = asdict(req_data.engine_args) | {"seed": seed}
if tensor_parallel_size is not None:
engine_args["tensor_parallel_size"] = tensor_parallel_size
llm = LLM(**engine_args) llm = LLM(**engine_args)
sampling_params = SamplingParams( sampling_params = SamplingParams(
@ -1378,7 +1389,13 @@ def run_generate(model, question: str, image_urls: list[str], seed: int | None):
print("-" * 50) print("-" * 50)
def run_chat(model: str, question: str, image_urls: list[str], seed: int | None): def run_chat(
model: str,
question: str,
image_urls: list[str],
seed: int | None,
tensor_parallel_size: int | None,
):
req_data = model_example_map[model](question, image_urls) req_data = model_example_map[model](question, image_urls)
# Disable other modalities to save memory # Disable other modalities to save memory
@ -1388,6 +1405,8 @@ def run_chat(model: str, question: str, image_urls: list[str], seed: int | None)
) )
engine_args = asdict(req_data.engine_args) | {"seed": seed} engine_args = asdict(req_data.engine_args) | {"seed": seed}
if tensor_parallel_size is not None:
engine_args["tensor_parallel_size"] = tensor_parallel_size
llm = LLM(**engine_args) llm = LLM(**engine_args)
sampling_params = ( sampling_params = (
@ -1463,6 +1482,13 @@ def parse_args():
default=2, default=2,
help="Number of images to use for the demo.", help="Number of images to use for the demo.",
) )
parser.add_argument(
"--tensor-parallel-size",
"-tp",
type=int,
default=None,
help="Tensor parallel size to override the model's default setting. ",
)
return parser.parse_args() return parser.parse_args()
@ -1470,13 +1496,20 @@ def main(args: Namespace):
model = args.model_type model = args.model_type
method = args.method method = args.method
seed = args.seed seed = args.seed
tensor_parallel_size = args.tensor_parallel_size
if tensor_parallel_size is not None and tensor_parallel_size < 1:
raise ValueError(
f"tensor_parallel_size must be a positive integer, "
f"got {tensor_parallel_size}"
)
image_urls = IMAGE_URLS[: args.num_images] image_urls = IMAGE_URLS[: args.num_images]
if method == "generate": if method == "generate":
run_generate(model, QUESTION, image_urls, seed) run_generate(model, QUESTION, image_urls, seed, tensor_parallel_size)
elif method == "chat": elif method == "chat":
run_chat(model, QUESTION, image_urls, seed) run_chat(model, QUESTION, image_urls, seed, tensor_parallel_size)
else: else:
raise ValueError(f"Invalid method: {method}") raise ValueError(f"Invalid method: {method}")

View File

@ -0,0 +1,184 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Example demonstrating MCP (Model Context Protocol) tools with the Responses API.
This example shows how to use MCP tools with different allowed_tools configurations:
1. No filter (allows all tools from the MCP server)
2. Wildcard "*" (explicitly allows all tools)
3. Specific tool names (filters to only those tools)
Set up this example by starting a vLLM OpenAI-compatible server with MCP tools enabled.
For example:
vllm serve openai/gpt-oss-20b --enforce-eager --tool-server demo
Environment variables:
- VLLM_ENABLE_RESPONSES_API_STORE=1
- VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS=code_interpreter,container
- VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS=1
"""
from openai import OpenAI
from utils import get_first_model
def example_no_filter():
    """Show that omitting ``allowed_tools`` exposes every tool on the server."""
    banner = "=" * 60
    print(banner)
    print("Example 1: No allowed_tools filter (allows all tools)")
    print(banner)

    client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="empty")
    model = get_first_model(client)

    # No allowed_tools specified - all tools are available
    mcp_tool = {
        "type": "mcp",
        "server_label": "code_interpreter",
        "server_url": "http://localhost:8888",
    }
    response = client.responses.create(
        model=model,
        input="Execute this code: print('Hello from Python!')",
        instructions="Use the Python tool to execute code.",
        tools=[mcp_tool],
    )

    print(f"Status: {response.status}")
    print(f"Output: {response.output_text}")
    print()
def example_wildcard():
    """Show that ``allowed_tools=['*']`` is an explicit opt-in to every tool."""
    banner = "=" * 60
    print(banner)
    print("Example 2: allowed_tools=['*'] (select all tools)")
    print(banner)

    client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="empty")
    model = get_first_model(client)

    # Using "*" to explicitly allow all tools from this MCP server
    # This is equivalent to not specifying allowed_tools
    mcp_tool = {
        "type": "mcp",
        "server_label": "code_interpreter",
        "server_url": "http://localhost:8888",
        "allowed_tools": ["*"],
    }
    response = client.responses.create(
        model=model,
        input="Execute this code: print('Hello from Python with wildcard!')",
        instructions="Use the Python tool to execute code.",
        tools=[mcp_tool],
    )

    print(f"Status: {response.status}")
    print(f"Output: {response.output_text}")
    print()
def example_specific_tools():
    """Show filtering an MCP server down to an explicit sub-tool list.

    Note: This example uses 'web_search_preview' (browser) which has multiple
    sub-tools: 'search', 'open', 'find'. The code_interpreter (python) doesn't
    have sub-tools, so filtering doesn't apply there.
    """
    banner = "=" * 60
    print(banner)
    print("Example 3: allowed_tools=['search'] (filter browser to specific tools)")
    print(banner)

    client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="empty")
    model = get_first_model(client)

    # Browser has tools: 'search', 'open', 'find'
    # Only allow 'search' - blocks 'open' and 'find'
    mcp_tool = {
        "type": "mcp",
        "server_label": "web_search_preview",
        "server_url": "http://localhost:8888",
        "allowed_tools": ["search"],
    }
    response = client.responses.create(
        model=model,
        input="Search for 'Python programming tutorials'",
        instructions="Use the browser tool to search.",
        tools=[mcp_tool],
    )

    print(f"Status: {response.status}")
    print(f"Output: {response.output_text}")
    print()
def example_object_format():
    """Show the object form of ``allowed_tools`` (``tool_names`` plus flags)."""
    banner = "=" * 60
    print(banner)
    print("Example 4: allowed_tools with object format")
    print(banner)

    client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="empty")
    model = get_first_model(client)

    # Object format with tool_names field
    # Can also include read_only and other fields
    # Browser has tools: 'search', 'open', 'find'
    allowed = {
        "tool_names": [
            "search",
            "open",
        ],  # Allow search and open, block find
        "read_only": False,
    }
    mcp_tool = {
        "type": "mcp",
        "server_label": "web_search_preview",
        "server_url": "http://localhost:8888",
        "allowed_tools": allowed,
    }
    response = client.responses.create(
        model=model,
        input="Search for 'machine learning' and open the first result",
        instructions="Use the browser tool.",
        tools=[mcp_tool],
    )

    print(f"Status: {response.status}")
    print(f"Output: {response.output_text}")
    print()
def main():
    """Run all examples."""
    banner = "=" * 60
    print("\n" + banner)
    print("MCP Tools with allowed_tools Examples")
    print(banner + "\n")

    # Each demo exercises a different allowed_tools configuration, in order.
    for demo in (
        example_no_filter,
        example_wildcard,
        example_specific_tools,
        example_object_format,
    ):
        demo()

    print(banner)
    print("Summary:")
    print(" - No filter or '*' → All tools available from server")
    print(" - Specific list → Only those sub-tools available")
    print(" - Object format → More control with tool_names field")
    print("")
    print("Note: allowed_tools filters SUB-TOOLS within an MCP server:")
    print(" - code_interpreter (python): No sub-tools to filter")
    print(" - web_search_preview (browser): Has 'search', 'open', 'find'")
    print(banner)
# Entry point: run all four examples when executed as a script
# (importing this module triggers no requests).
if __name__ == "__main__":
    main()

View File

@ -1,97 +0,0 @@
# Pooling models
## Cohere rerank usage
```bash
# vllm serve BAAI/bge-reranker-base
python examples/online_serving/pooling/cohere_rerank_client.py
```
## Embedding requests base64 encoding_format usage
```bash
# vllm serve intfloat/e5-small
python examples/online_serving/pooling/embedding_requests_base64_client.py
```
## Embedding requests bytes encoding_format usage
```bash
# vllm serve intfloat/e5-small
python examples/online_serving/pooling/embedding_requests_bytes_client.py
```
## Jinaai rerank usage
```bash
# vllm serve BAAI/bge-reranker-base
python examples/online_serving/pooling/jinaai_rerank_client.py
```
## Multi vector retrieval usage
```bash
# vllm serve BAAI/bge-m3
python examples/online_serving/pooling/multi_vector_retrieval_client.py
```
## Named Entity Recognition (NER) usage
```bash
# vllm serve boltuix/NeuroBERT-NER
python examples/online_serving/pooling/ner_client.py
```
## OpenAI chat embedding for multimodal usage
```bash
python examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py
```
## OpenAI classification usage
```bash
# vllm serve jason9693/Qwen2.5-1.5B-apeach
python examples/online_serving/pooling/openai_classification_client.py
```
## OpenAI cross_encoder score usage
```bash
# vllm serve BAAI/bge-reranker-v2-m3
python examples/online_serving/pooling/openai_cross_encoder_score.py
```
## OpenAI cross_encoder score for multimodal usage
```bash
# vllm serve jinaai/jina-reranker-m0
python examples/online_serving/pooling/openai_cross_encoder_score_for_multimodal.py
```
## OpenAI embedding usage
```bash
# vllm serve intfloat/e5-small
python examples/online_serving/pooling/openai_embedding_client.py
```
## OpenAI embedding matryoshka dimensions usage
```bash
# vllm serve jinaai/jina-embeddings-v3 --trust-remote-code
python examples/online_serving/pooling/openai_embedding_matryoshka_fy.py
```
## OpenAI pooling usage
```bash
# vllm serve internlm/internlm2-1_8b-reward --trust-remote-code
python examples/online_serving/pooling/openai_pooling_client.py
```
## Online Prithvi Geospatial MAE usage
```bash
python examples/online_serving/pooling/prithvi_geospatial_mae.py
```

Some files were not shown because too many files have changed in this diff Show More