diff --git a/.buildkite/performance-benchmarks/README.md b/.buildkite/performance-benchmarks/README.md
index 6d494f64f14fa..015f48c2520d6 100644
--- a/.buildkite/performance-benchmarks/README.md
+++ b/.buildkite/performance-benchmarks/README.md
@@ -108,6 +108,65 @@ The number of this test is less stable compared to the delay and latency benchma
WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`.
+#### Default Parameters Field
+
+We can specify default parameters in a JSON field with key `defaults`. Parameters defined in the field are applied globally to all serving tests, and can be overridden in test case fields. Here is an example:
+
+
+ An Example of default parameters field
+
+```json
+{
+ "defaults": {
+ "qps_list": [
+ "inf"
+ ],
+ "server_environment_variables": {
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1
+ },
+ "server_parameters": {
+ "tensor_parallel_size": 1,
+ "dtype": "bfloat16",
+ "block_size": 128,
+ "disable_log_stats": "",
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "num_prompts": 200,
+ "ignore-eos": ""
+ }
+ },
+ "tests": [
+ {
+ "test_name": "serving_llama3B_tp2_random_128_128",
+ "server_parameters": {
+ "model": "meta-llama/Llama-3.2-3B-Instruct",
+ "tensor_parallel_size": 2
+ },
+ "client_parameters": {
+ "model": "meta-llama/Llama-3.2-3B-Instruct"
+ }
+ },
+ {
+ "test_name": "serving_qwen3_tp4_random_128_128",
+ "server_parameters": {
+ "model": "Qwen/Qwen3-14B",
+ "tensor_parallel_size": 4
+ },
+ "client_parameters": {
+ "model": "Qwen/Qwen3-14B"
+ }
+ }
+ ]
+}
+```
+
+
+
### Visualizing the results
The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](performance-benchmarks-descriptions.md) with real benchmarking results.
diff --git a/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
index 99a5a5e334f8e..34ceefe0996f2 100644
--- a/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
@@ -110,7 +110,8 @@ json2envs() {
wait_for_server() {
# wait for vllm server to start
# return 1 if vllm server crashes
- timeout 1200 bash -c '
+ local timeout_val="1200"
+ timeout "$timeout_val" bash -c '
until curl -X POST localhost:8000/v1/completions; do
sleep 1
done' && return 0 || return 1
@@ -316,12 +317,44 @@ run_throughput_tests() {
run_serving_tests() {
# run serving tests using `vllm bench serve` command
# $1: a json file specifying serving test cases
+ #
+ # Supported JSON formats:
+ # 1) Plain format: top-level array
+ # [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
+ #
+ # 2) Default parameters field + plain format tests
+ # {
+ # "defaults": { ... },
+ # "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
+ # }
local serving_test_file
serving_test_file=$1
# Iterate over serving tests
- jq -c '.[]' "$serving_test_file" | while read -r params; do
+ jq -c '
+ if type == "array" then
+ # Plain format: test cases array
+ .[]
+ elif (type == "object" and has("tests")) then
+ # merge the default parameters into each test cases
+ . as $root
+ | ($root.defaults // {}) as $d
+ | ($root.tests // [])[]
+ # default qps / max_concurrency from defaults if missing
+ | .qps_list = (.qps_list // $d.qps_list)
+ | .max_concurrency_list = (.max_concurrency_list // $d.max_concurrency_list)
+ # merge envs / params: test overrides defaults
+ | .server_environment_variables =
+ (($d.server_environment_variables // {}) + (.server_environment_variables // {}))
+ | .server_parameters =
+ (($d.server_parameters // {}) + (.server_parameters // {}))
+ | .client_parameters =
+ (($d.client_parameters // {}) + (.client_parameters // {}))
+ else
+ error("Unsupported serving test file format: must be array or object with .tests")
+ end
+ ' "$serving_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
if [[ ! "$test_name" =~ ^serving_ ]]; then
@@ -335,20 +368,25 @@ run_serving_tests() {
continue
fi
- # get client and server arguments
+ # get client and server arguments (after merging in the default parameters)
server_params=$(echo "$params" | jq -r '.server_parameters')
server_envs=$(echo "$params" | jq -r '.server_environment_variables')
client_params=$(echo "$params" | jq -r '.client_parameters')
+
server_args=$(json2args "$server_params")
server_envs=$(json2envs "$server_envs")
client_args=$(json2args "$client_params")
+
+ # qps_list
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list"
+
+ # max_concurrency_list (fallback to num_prompts if missing)
max_concurrency_list=$(echo "$params" | jq -r '.max_concurrency_list')
if [[ -z "$max_concurrency_list" || "$max_concurrency_list" == "null" ]]; then
- num_prompts=$(echo "$client_params" | jq -r '.num_prompts')
- max_concurrency_list="[$num_prompts]"
+ num_prompts=$(echo "$client_params" | jq -r '.num_prompts')
+ max_concurrency_list="[$num_prompts]"
fi
max_concurrency_list=$(echo "$max_concurrency_list" | jq -r '.[] | @sh')
echo "Running over max concurrency list $max_concurrency_list"
diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc2.json b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc2.json
deleted file mode 100644
index f758097e098e4..0000000000000
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc2.json
+++ /dev/null
@@ -1,610 +0,0 @@
-[
- {
- "test_name": "serving_llama8B_bf16_tp1_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_bf16_tp2_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_bf16_tp4_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 4,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_bf16_tp1_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_bf16_tp2_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_bf16_tp4_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 4,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int8_tp1_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int8_tp2_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int8_tp4_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 4,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int8_tp1_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int8_tp2_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int8_tp4_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 4,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int4_tp1_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "tensor_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int4_tp2_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int4_tp4_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "tensor_parallel_size": 4,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int4_tp1_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "tensor_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int4_tp2_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int4_tp4_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "tensor_parallel_size": 4,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- }
-]
diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc3.json b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc3.json
deleted file mode 100644
index 0b1a42e790255..0000000000000
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc3.json
+++ /dev/null
@@ -1,1023 +0,0 @@
-[
- {
- "test_name": "serving_llama8B_bf16_pp1_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "pipeline_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_bf16_tp2_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_bf16_pp3_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "pipeline_parallel_size": 3,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_bf16_tp4_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 4,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_bf16_tp2pp3_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 2,
- "pipeline_parallel_size": 3,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_bf16_pp1_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "pipeline_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_bf16_tp2_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_bf16_pp3_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "pipeline_parallel_size": 3,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_bf16_tp4_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 4,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_bf16_tp2pp3_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 2,
- "pipeline_parallel_size": 3,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int8_pp1_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "pipeline_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int8_tp2_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int8_pp3_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "pipeline_parallel_size": 3,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int8_tp4_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 4,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int8_tp2pp3_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 2,
- "pipeline_parallel_size": 3,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int8_pp1_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "pipeline_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int8_tp2_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int8_pp3_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "pipeline_parallel_size": 3,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int8_tp4_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 4,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int8_tp2pp3_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 2,
- "pipeline_parallel_size": 3,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int4_pp1_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "pipeline_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int4_tp2_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int4_pp3_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "pipeline_parallel_size": 3,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int4_tp4_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "tensor_parallel_size": 4,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int4_tp2pp3_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "tensor_parallel_size": 2,
- "pipeline_parallel_size": 3,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int4_pp1_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "pipeline_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int4_tp2_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int4_pp3_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "pipeline_parallel_size": 3,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int4_tp4_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "tensor_parallel_size": 4,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int4_tp2pp3_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "tensor_parallel_size": 2,
- "pipeline_parallel_size": 3,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- }
-]
diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json b/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
index f792956f39472..8f7200862d20c 100644
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
@@ -1,276 +1,246 @@
-[
- {
- "test_name": "serving_llama8B_tp1_sharegpt",
- "qps_list": [1, 4, 16, "inf"],
- "max_concurrency_list": [32],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 32
- }
+{
+ "defaults": {
+ "qps_list": [
+ "inf"
+ ],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
},
- {
- "test_name": "serving_llama8B_tp2_sharegpt",
- "qps_list": [1, 4, 16, "inf"],
- "max_concurrency_list": [32],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 32
- }
+ "server_parameters": {
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
+ "tensor_parallel_size": 1,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
},
- {
- "test_name": "serving_llama8B_tp1_random_128_128",
- "qps_list": [1, 4, 16, "inf"],
- "max_concurrency_list": [32],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 32
- }
- },
- {
- "test_name": "serving_llama8B_tp2_random_128_128",
- "qps_list": [1, 4, 16, "inf"],
- "max_concurrency_list": [32],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 32
- }
- },
- {
- "test_name": "serving_llama8B_tp1_random_128_2048",
- "qps_list": [1, 4, 16, "inf"],
- "max_concurrency_list": [32],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 2048,
- "ignore-eos": "",
- "num_prompts": 32
- }
- },
- {
- "test_name": "serving_llama8B_tp2_random_128_2048",
- "qps_list": [1, 4, 16, "inf"],
- "max_concurrency_list": [32],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 2048,
- "ignore-eos": "",
- "num_prompts": 32
- }
- },
- {
- "test_name": "serving_llama8B_tp1_random_2048_128",
- "qps_list": [1, 4, 16, "inf"],
- "max_concurrency_list": [32],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 2048,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 32
- }
- },
- {
- "test_name": "serving_llama8B_tp2_random_2048_128",
- "qps_list": [1, 4, 16, "inf"],
- "max_concurrency_list": [32],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 2048,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 32
- }
+ "client_parameters": {
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
+ "backend": "vllm",
+ "ignore-eos": "",
+ "num_prompts": 200
}
-]
+ },
+ "tests": [
+ {
+ "test_name": "serving_llama8B_tp1_sharegpt",
+ "server_parameters": {
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp2_sharegpt",
+ "server_parameters": {
+ "tensor_parallel_size": 2
+ },
+ "client_parameters": {
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp1_random_128_128",
+ "server_parameters": {
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp2_random_128_128",
+ "server_parameters": {
+ "tensor_parallel_size": 2
+ },
+ "client_parameters": {
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp4_random_128_128",
+ "server_parameters": {
+ "tensor_parallel_size": 4
+ },
+ "client_parameters": {
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp1_random_128_2048",
+ "server_parameters": {
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 2048
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp2_random_128_2048",
+ "server_parameters": {
+ "tensor_parallel_size": 2
+ },
+ "client_parameters": {
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 2048
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp4_random_128_2048",
+ "server_parameters": {
+ "tensor_parallel_size": 4
+ },
+ "client_parameters": {
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 2048
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp1_random_2048_128",
+ "server_parameters": {
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "dataset_name": "random",
+ "random-input-len": 2048,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp2_random_2048_128",
+ "server_parameters": {
+ "tensor_parallel_size": 2
+ },
+ "client_parameters": {
+ "dataset_name": "random",
+ "random-input-len": 2048,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp4_random_2048_128",
+ "server_parameters": {
+ "tensor_parallel_size": 4
+ },
+ "client_parameters": {
+ "dataset_name": "random",
+ "random-input-len": 2048,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_llama3B_tp1_random_128_128",
+ "server_parameters": {
+ "model": "meta-llama/Llama-3.2-3B-Instruct",
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "model": "meta-llama/Llama-3.2-3B-Instruct",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_granite2B_tp1_random_128_128",
+ "server_parameters": {
+ "model": "ibm-granite/granite-3.2-2b-instruct",
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "model": "ibm-granite/granite-3.2-2b-instruct",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_qwen1.7B_tp1_random_128_128",
+ "server_parameters": {
+ "model": "Qwen/Qwen3-1.7B",
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "model": "Qwen/Qwen3-1.7B",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_qwen4B_tp1_random_128_128",
+ "server_parameters": {
+ "model": "Qwen/Qwen3-4B",
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "model": "Qwen/Qwen3-4B",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_qwen8B_tp1_random_128_128",
+ "server_parameters": {
+ "model": "Qwen/Qwen3-8B",
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "model": "Qwen/Qwen3-8B",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_glm9B_tp1_random_128_128",
+ "server_parameters": {
+ "model": "zai-org/glm-4-9b-hf",
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "model": "zai-org/glm-4-9b-hf",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_gemma7B_tp1_random_128_128",
+ "server_parameters": {
+ "model": "google/gemma-7b",
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "model": "google/gemma-7b",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
+ }
+ ]
+}
diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml
index 38c400ba1faf5..fbfc923998f89 100644
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -8,7 +8,7 @@ steps:
commands:
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh"
@@ -30,19 +30,6 @@ steps:
DOCKER_BUILDKIT: "1"
# x86 + CUDA builds
- - label: "Build wheel - CUDA 12.8"
- depends_on: ~
- id: build-wheel-cuda-12-8
- agents:
- queue: cpu_queue_postmerge
- commands:
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- - "mkdir artifacts"
- - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- - "bash .buildkite/scripts/upload-wheels.sh"
- env:
- DOCKER_BUILDKIT: "1"
-
- label: "Build wheel - CUDA 12.9"
depends_on: ~
id: build-wheel-cuda-12-9
@@ -109,7 +96,6 @@ steps:
- label: "Annotate release workflow"
depends_on:
- create-multi-arch-manifest
- - build-wheel-cuda-12-8
id: annotate-release-workflow
agents:
queue: cpu_queue_postmerge
diff --git a/.buildkite/scripts/generate-nightly-index.py b/.buildkite/scripts/generate-nightly-index.py
new file mode 100644
index 0000000000000..8d09ba178db7b
--- /dev/null
+++ b/.buildkite/scripts/generate-nightly-index.py
@@ -0,0 +1,369 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# do not complain about line length (for docstring)
+# ruff: noqa: E501
+
+import argparse
+import json
+import re
+import sys
+from dataclasses import asdict, dataclass
+from pathlib import Path
+from typing import Any
+from urllib.parse import quote
+
+if not sys.version_info >= (3, 12):
+ raise RuntimeError("This script requires Python 3.12 or higher.")
+
+INDEX_HTML_TEMPLATE = """
+<!DOCTYPE html>
+<html>
+<body>
+{items}
+</body>
+</html>
+"""
+
+
+@dataclass
+class WheelFileInfo:
+ package_name: str
+ version: str
+ build_tag: str | None
+ python_tag: str
+ abi_tag: str
+ platform_tag: str
+ variant: str | None
+ filename: str
+
+
+def parse_from_filename(file: str) -> WheelFileInfo:
+ """
+ Parse wheel file name to extract metadata.
+
+ The format of wheel names:
+ {package_name}-{version}(-{build_tag})?-{python_tag}-{abi_tag}-{platform_tag}.whl
+ All versions could contain a variant like '+cu129' or '.cpu' or `.rocm` (or not).
+ Example:
+ vllm-0.11.0-cp38-abi3-manylinux1_x86_64.whl
+ vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl
+ vllm-0.11.1rc8.dev14+gaa384b3c0-cp38-abi3-manylinux2014_aarch64.whl
+ vllm-0.11.1rc8.dev14+gaa384b3c0.cu130-cp38-abi3-manylinux1_x86_64.whl
+ """
+ wheel_file_re = re.compile(
+        r"^(?P<package_name>.+)-(?P<version>[^-]+?)(-(?P<build_tag>[^-]+))?-(?P<python_tag>[^-]+)-(?P<abi_tag>[^-]+)-(?P<platform_tag>[^-]+)\.whl$"
+ )
+ match = wheel_file_re.match(file)
+ if not match:
+ raise ValueError(f"Invalid wheel file name: {file}")
+
+ package_name = match.group("package_name")
+ version = match.group("version")
+ build_tag = match.group("build_tag")
+ python_tag = match.group("python_tag")
+ abi_tag = match.group("abi_tag")
+ platform_tag = match.group("platform_tag")
+
+ # extract variant from version
+ variant = None
+ if "dev" in version:
+ ver_after_dev = version.split("dev")[-1]
+ if "." in ver_after_dev:
+ variant = ver_after_dev.split(".")[-1]
+ version = version.removesuffix("." + variant)
+ else:
+ if "+" in version:
+ version, variant = version.split("+")
+
+ return WheelFileInfo(
+ package_name=package_name,
+ version=version,
+ build_tag=build_tag,
+ python_tag=python_tag,
+ abi_tag=abi_tag,
+ platform_tag=platform_tag,
+ variant=variant,
+ filename=file,
+ )
+
+
+def generate_project_list(subdir_names: list[str]) -> str:
+ """
+ Generate project list HTML content linking to each project & variant sub-directory.
+ """
+ href_tags = []
+ for name in sorted(subdir_names):
+ name = name.strip("/").strip(".")
+        href_tags.append(f'    <a href="{name}/">{name}/</a><br/>')
+ return INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags))
+
+
+def generate_package_index_and_metadata(
+ wheel_files: list[WheelFileInfo], wheel_base_dir: Path, index_base_dir: Path
+) -> tuple[str, str]:
+ """
+ Generate package index HTML content for a specific package, linking to actual wheel files.
+ """
+ href_tags = []
+ metadata = []
+ for file in sorted(wheel_files, key=lambda x: x.filename):
+ relative_path = (
+ wheel_base_dir.relative_to(index_base_dir, walk_up=True) / file.filename
+ )
+ # handle with '+' in URL, and avoid double-encoding '/' and already-encoded '%2B'
+ # NOTE: this is AWS S3 specific behavior!
+ file_path_quoted = quote(relative_path.as_posix(), safe=":%/")
+        href_tags.append(f'    <a href="{file_path_quoted}">{file.filename}</a><br/>')
+ file_meta = asdict(file)
+ file_meta["path"] = file_path_quoted
+ metadata.append(file_meta)
+ index_str = INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags))
+ metadata_str = json.dumps(metadata, indent=2)
+ return index_str, metadata_str
+
+
+def generate_index_and_metadata(
+ whl_files: list[str],
+ wheel_base_dir: Path,
+ index_base_dir: Path,
+ default_variant: str | None = None,
+ alias_to_default: str | None = None,
+):
+ """
+ Generate index for all wheel files.
+
+ Args:
+ whl_files (list[str]): List of wheel files (must be directly under `wheel_base_dir`).
+ wheel_base_dir (Path): Base directory for wheel files.
+ index_base_dir (Path): Base directory to store index files.
+ default_variant (str | None): The default variant name, if any.
+ alias_to_default (str | None): Alias variant name for the default variant, if any.
+
+ First, parse all wheel files to extract metadata.
+ We need to collect all wheel files for each variant, and generate an index for it (in a sub-directory).
+ The index for the default variant (if any) is generated in the root index directory.
+
+ If `default_variant` is provided, all wheels must have variant suffixes, and the default variant index
+ is purely a copy of the corresponding variant index, with only the links adjusted.
+ Otherwise, all wheels without variant suffixes are treated as the default variant.
+
+ If `alias_to_default` is provided, an additional alias sub-directory is created, it has the same content
+ as the default variant index, but the links are adjusted accordingly.
+
+ Index directory structure:
+ index_base_dir/ (hosted at wheels.vllm.ai/{nightly,$commit,$version}/)
+ index.html # project list, linking to "vllm/" and other packages, and all variant sub-directories
+ vllm/
+ index.html # package index, pointing to actual files in wheel_base_dir (relative path)
+ metadata.json # machine-readable metadata for all wheels in this package
+ cpu/ # cpu variant sub-directory
+ index.html
+ vllm/
+ index.html
+ metadata.json
+ cu129/ # cu129 is actually the alias to default variant
+ index.html
+ vllm/
+ index.html
+ metadata.json
+ cu130/ # cu130 variant sub-directory
+ index.html
+ vllm/
+ index.html
+ metadata.json
+ ...
+
+ metadata.json stores a dump of all wheel files' metadata in a machine-readable format:
+ [
+ {
+ "package_name": "vllm",
+ "version": "0.10.2rc2",
+ "build_tag": null,
+ "python_tag": "cp38",
+ "abi_tag": "abi3",
+ "platform_tag": "manylinux2014_aarch64",
+ "variant": "cu129",
+ "filename": "vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl",
+ "path": "../vllm-0.10.2rc2%2Bcu129-cp38-abi3-manylinux2014_aarch64.whl" # to be concatenated with the directory URL and URL-encoded
+ },
+ ...
+ ]
+ """
+
+ parsed_files = [parse_from_filename(f) for f in whl_files]
+
+ if not parsed_files:
+ print("No wheel files found, skipping index generation.")
+ return
+
+ # Group by variant
+ variant_to_files: dict[str, list[WheelFileInfo]] = {}
+ for file in parsed_files:
+ variant = file.variant or "default"
+ if variant not in variant_to_files:
+ variant_to_files[variant] = []
+ variant_to_files[variant].append(file)
+
+ print(f"Found variants: {list(variant_to_files.keys())}")
+
+ # sanity check for default variant
+ if default_variant:
+ if "default" in variant_to_files:
+ raise ValueError(
+ "All wheel files must have variant suffixes when `default_variant` is specified."
+ )
+ if default_variant not in variant_to_files:
+ raise ValueError(
+ f"Default variant '{default_variant}' not found among wheel files."
+ )
+
+ if alias_to_default:
+ if "default" not in variant_to_files:
+ # e.g. only some wheels are uploaded to S3 currently
+ print(
+ "[WARN] Alias to default variant specified, but no default variant found."
+ )
+ elif alias_to_default in variant_to_files:
+ raise ValueError(
+ f"Alias variant name '{alias_to_default}' already exists among wheel files."
+ )
+ else:
+ variant_to_files[alias_to_default] = variant_to_files["default"].copy()
+ print(f"Alias variant '{alias_to_default}' created for default variant.")
+
+ # Generate index for each variant
+ subdir_names = set()
+ for variant, files in variant_to_files.items():
+ if variant == "default":
+ variant_dir = index_base_dir
+ else:
+ variant_dir = index_base_dir / variant
+ subdir_names.add(variant)
+
+ variant_dir.mkdir(parents=True, exist_ok=True)
+
+ # gather all package names in this variant
+ packages = set(f.package_name for f in files)
+ if variant == "default":
+ # these packages should also appear in the "project list"
+ # generate after all variants are processed
+ subdir_names = subdir_names.union(packages)
+ else:
+ # generate project list for this variant directly
+ project_list_str = generate_project_list(sorted(packages))
+ with open(variant_dir / "index.html", "w") as f:
+ f.write(project_list_str)
+
+ for package in packages:
+ # filter files belonging to this package only
+ package_files = [f for f in files if f.package_name == package]
+ package_dir = variant_dir / package
+ package_dir.mkdir(parents=True, exist_ok=True)
+ index_str, metadata_str = generate_package_index_and_metadata(
+ package_files, wheel_base_dir, package_dir
+ )
+ with open(package_dir / "index.html", "w") as f:
+ f.write(index_str)
+ with open(package_dir / "metadata.json", "w") as f:
+ f.write(metadata_str)
+
+ # Generate top-level project list index
+ project_list_str = generate_project_list(sorted(subdir_names))
+ with open(index_base_dir / "index.html", "w") as f:
+ f.write(project_list_str)
+
+
+if __name__ == "__main__":
+ """
+ Arguments:
+ --version : version string for the current build (e.g., commit hash)
+ --current-objects : path to JSON file containing current S3 objects listing in this version directory
+ --output-dir : directory to store generated index files
+ --alias-to-default : (optional) alias variant name for the default variant
+ """
+
+ parser = argparse.ArgumentParser(
+ description="Process nightly build wheel files to generate indices."
+ )
+ parser.add_argument(
+ "--version",
+ type=str,
+ required=True,
+ help="Version string for the current build (e.g., commit hash)",
+ )
+ parser.add_argument(
+ "--current-objects",
+ type=str,
+ required=True,
+ help="Path to JSON file containing current S3 objects listing in this version directory",
+ )
+ parser.add_argument(
+ "--output-dir",
+ type=str,
+ required=True,
+ help="Directory to store generated index files",
+ )
+ parser.add_argument(
+ "--alias-to-default",
+ type=str,
+ default=None,
+ help="Alias variant name for the default variant",
+ )
+
+ args = parser.parse_args()
+
+ version = args.version
+ if "/" in version or "\\" in version:
+ raise ValueError("Version string must not contain slashes.")
+ current_objects_path = Path(args.current_objects)
+ output_dir = Path(args.output_dir)
+ if not output_dir.exists():
+ output_dir.mkdir(parents=True, exist_ok=True)
+
+ # Read current objects JSON
+ with open(current_objects_path) as f:
+ current_objects: dict[str, list[dict[str, Any]]] = json.load(f)
+
+ # current_objects looks like from list_objects_v2 S3 API:
+ """
+ "Contents": [
+ {
+ "Key": "e2f56c309d2a28899c68975a7e104502d56deb8f/vllm-0.11.2.dev363+ge2f56c309-cp38-abi3-manylinux1_x86_64.whl",
+ "LastModified": "2025-11-28T14:00:32+00:00",
+ "ETag": "\"37a38339c7cdb61ca737021b968075df-52\"",
+ "ChecksumAlgorithm": [
+ "CRC64NVME"
+ ],
+ "ChecksumType": "FULL_OBJECT",
+ "Size": 435649349,
+ "StorageClass": "STANDARD"
+ },
+ ...
+ ]
+ """
+
+ # Extract wheel file keys
+ wheel_files = []
+ for item in current_objects.get("Contents", []):
+ key: str = item["Key"]
+ if key.endswith(".whl"):
+ wheel_files.append(key.split("/")[-1]) # only the filename is used
+
+ print(f"Found {len(wheel_files)} wheel files for version {version}: {wheel_files}")
+
+    # Generate index and metadata, assuming wheels and indices are stored as:
+    # s3://vllm-wheels/{version}/          <- wheel files
+    # s3://vllm-wheels/<index location>/   <- generated index files
+ wheel_base_dir = Path(output_dir).parent / version
+ index_base_dir = Path(output_dir)
+
+ generate_index_and_metadata(
+ whl_files=wheel_files,
+ wheel_base_dir=wheel_base_dir,
+ index_base_dir=index_base_dir,
+ default_variant=None,
+ alias_to_default=args.alias_to_default,
+ )
+ print(f"Successfully generated index and metadata in {output_dir}")
diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh b/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
index d0036f24c8d04..b5f6b2494792f 100755
--- a/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
@@ -7,53 +7,51 @@ set -ex
# allow to bind to different cores
CORE_RANGE=${CORE_RANGE:-0-16}
OMP_CORE_RANGE=${OMP_CORE_RANGE:-0-16}
-NUMA_NODE=${NUMA_NODE:-0}
-export CMAKE_BUILD_PARALLEL_LEVEL=32
+export CMAKE_BUILD_PARALLEL_LEVEL=16
# Setup cleanup
remove_docker_container() {
set -e;
- docker rm -f cpu-test-"$NUMA_NODE" || true;
+ docker rm -f cpu-test || true;
}
trap remove_docker_container EXIT
remove_docker_container
# Try building the docker image
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
+docker build --tag cpu-test --target vllm-test -f docker/Dockerfile.cpu .
-# Run the image, setting --shm-size=4g for tensor parallel.
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
+# Run the image
+docker run -itd --cpuset-cpus="$CORE_RANGE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test cpu-test
function cpu_tests() {
set -e
- export NUMA_NODE=$2
- docker exec cpu-test-"$NUMA_NODE" bash -c "
+ docker exec cpu-test bash -c "
set -e
pip list"
# offline inference
- docker exec cpu-test-"$NUMA_NODE" bash -c "
+ docker exec cpu-test bash -c "
set -e
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
# Run kernel tests
- docker exec cpu-test-"$NUMA_NODE" bash -c "
+ docker exec cpu-test bash -c "
set -e
pytest -x -v -s tests/kernels/test_onednn.py
pytest -x -v -s tests/kernels/attention/test_cpu_attn.py"
# basic online serving
- docker exec cpu-test-"$NUMA_NODE" bash -c '
+ docker exec cpu-test bash -c '
set -e
- VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS vllm serve meta-llama/Llama-3.2-3B-Instruct --max-model-len 2048 &
+ VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS vllm serve Qwen/Qwen3-0.6B --max-model-len 2048 &
server_pid=$!
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
vllm bench serve \
--backend vllm \
--dataset-name random \
- --model meta-llama/Llama-3.2-3B-Instruct \
+ --model Qwen/Qwen3-0.6B \
--num-prompts 20 \
--endpoint /v1/completions
kill -s SIGTERM $server_pid &'
@@ -61,4 +59,4 @@ function cpu_tests() {
# All of CPU tests are expected to be finished less than 40 mins.
export -f cpu_tests
-timeout 2h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
+timeout 2h bash -c cpu_tests
diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
index 2267718f75ca5..438fe522c8702 100644
--- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
@@ -21,8 +21,8 @@ trap remove_docker_container EXIT
remove_docker_container
# Try building the docker image
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
# Run the image, setting --shm-size=4g for tensor parallel.
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
diff --git a/.buildkite/scripts/hardware_ci/run-xpu-test.sh b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
index d49f3e2f47cf1..4d163399cfc6c 100644
--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@@ -35,7 +35,7 @@ docker run \
echo $ZE_AFFINITY_MASK
pip install tblib==3.1.0
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
- python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
+ python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
diff --git a/.buildkite/scripts/upload-wheels.sh b/.buildkite/scripts/upload-wheels.sh
index 945c5e48c0090..2eaa91c04086c 100644
--- a/.buildkite/scripts/upload-wheels.sh
+++ b/.buildkite/scripts/upload-wheels.sh
@@ -2,6 +2,28 @@
set -ex
+# ======== part 0: setup ========
+
+BUCKET="vllm-wheels"
+INDICES_OUTPUT_DIR="indices"
+DEFAULT_VARIANT_ALIAS="cu129" # align with vLLM_MAIN_CUDA_VERSION in vllm/envs.py
+PYTHON=${PYTHON_PROG:=python3} # try to read from env var, otherwise use python3
+SUBPATH=$BUILDKITE_COMMIT
+S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
+
+# detect if python3.12+ is available
+has_new_python=$($PYTHON -c "print(1 if __import__('sys').version_info >= (3,12) else 0)")
+if [[ "$has_new_python" -eq 0 ]]; then
+ # use new python from docker
+ docker pull python:3-slim
+ PYTHON="docker run --rm -v $(pwd):/app -w /app python:3-slim python3"
+fi
+
+echo "Using python interpreter: $PYTHON"
+echo "Python version: $($PYTHON --version)"
+
+# ========= part 1: collect, rename & upload the wheel ==========
+
# Assume wheels are in artifacts/dist/*.whl
wheel_files=(artifacts/dist/*.whl)
@@ -10,74 +32,69 @@ if [[ ${#wheel_files[@]} -ne 1 ]]; then
echo "Error: Expected exactly one wheel file in artifacts/dist/, but found ${#wheel_files[@]}"
exit 1
fi
-
-# Get the single wheel file
wheel="${wheel_files[0]}"
-# Detect architecture and rename 'linux' to appropriate manylinux version
-arch=$(uname -m)
-if [[ $arch == "x86_64" ]]; then
- manylinux_version="manylinux1"
-elif [[ $arch == "aarch64" ]]; then
- manylinux_version="manylinux2014"
-else
- echo "Warning: Unknown architecture $arch, using manylinux1 as default"
- manylinux_version="manylinux1"
-fi
+# current build image uses ubuntu 20.04, which corresponds to manylinux_2_31
+# refer to https://github.com/mayeut/pep600_compliance?tab=readme-ov-file#acceptable-distros-to-build-wheels
+manylinux_version="manylinux_2_31"
# Rename 'linux' to the appropriate manylinux version in the wheel filename
+if [[ "$wheel" != *"linux"* ]]; then
+ echo "Error: Wheel filename does not contain 'linux': $wheel"
+ exit 1
+fi
new_wheel="${wheel/linux/$manylinux_version}"
mv -- "$wheel" "$new_wheel"
wheel="$new_wheel"
+echo "Renamed wheel to: $wheel"
# Extract the version from the wheel
version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
-echo "Version: $version"
+echo "Version in wheel: $version"
+pure_version="${version%%+*}"
+echo "Pure version (without variant): $pure_version"
-normal_wheel="$wheel" # Save the original wheel filename
+# copy wheel to its own bucket
+aws s3 cp "$wheel" "$S3_COMMIT_PREFIX"
-# If the version contains "dev", rename it to v1.0.0.dev for consistency
-if [[ $version == *dev* ]]; then
- suffix="${version##*.}"
- if [[ $suffix == cu* ]]; then
- new_version="1.0.0.dev+${suffix}"
- else
- new_version="1.0.0.dev"
- fi
- new_wheel="${wheel/$version/$new_version}"
- # use cp to keep both files in the artifacts directory
- cp -- "$wheel" "$new_wheel"
- wheel="$new_wheel"
- version="$new_version"
-fi
+# ========= part 2: generate and upload indices ==========
+# generate indices for all existing wheels in the commit directory
+# this script might be run multiple times if there are multiple variants being built
+# so we need to guarantee there is little chance for "TOCTOU" issues
+# i.e., one process is generating indices while another is uploading a new wheel
+# so we need to ensure no time-consuming operations happen below
-# Upload the wheel to S3
-python3 .buildkite/generate_index.py --wheel "$normal_wheel"
+# list all wheels in the commit directory
+echo "Existing wheels on S3:"
+aws s3 ls "$S3_COMMIT_PREFIX"
+obj_json="objects.json"
+aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$SUBPATH/" --delimiter / --output json > "$obj_json"
+mkdir -p "$INDICES_OUTPUT_DIR"
-# generate index for this commit
-aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
-aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
-
-if [[ $normal_wheel == *"cu129"* ]]; then
- # only upload index.html for cu129 wheels (default wheels) as it
- # is available on both x86 and arm64
- aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
- aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
+# call script to generate indices for all existing wheels
+# these indices have relative paths that work as long as they sit next to the wheel directory in s3
+# i.e., the wheels are always in s3://vllm-wheels/<commit>/
+# and indices can be placed in /<commit>/, or /nightly/, or /<version>/
+if [[ ! -z "$DEFAULT_VARIANT_ALIAS" ]]; then
+ alias_arg="--alias-to-default $DEFAULT_VARIANT_ALIAS"
else
- echo "Skipping index files for non-cu129 wheels"
+ alias_arg=""
fi
-# generate index for nightly
-aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
-aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
+$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" $alias_arg
-if [[ $normal_wheel == *"cu129"* ]]; then
- # only upload index.html for cu129 wheels (default wheels) as it
- # is available on both x86 and arm64
- aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
-else
- echo "Skipping index files for non-cu129 wheels"
+# copy indices to /<commit>/ unconditionally
+echo "Uploading indices to $S3_COMMIT_PREFIX"
+aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "$S3_COMMIT_PREFIX"
+
+# copy to /nightly/ only if it is on the main branch and not a PR
+if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]]; then
+ echo "Uploading indices to overwrite /nightly/"
+ aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/nightly/"
fi
-aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
-aws s3 cp index.html "s3://vllm-wheels/$version/vllm/index.html"
+# copy to /<version>/ only if it does not have "dev" in the version
+if [[ "$version" != *"dev"* ]]; then
+ echo "Uploading indices to overwrite /$pure_version/"
+ aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/"
+fi
diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index 4ddf11c0b268f..ee4fdebae5675 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -39,9 +39,9 @@ steps:
# if this test fails, it means the nightly torch version is not compatible with some
# of the dependencies. Please check the error message and add the package to whitelist
# in /vllm/tools/pre_commit/generate_nightly_torch_test.py
- mirror_hardwares: [amdexperimental]
+ mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
agent_pool: mi325_1
- # grade: Blocking
+ grade: Blocking
soft_fail: true
source_file_dependencies:
- requirements/nightly_torch_test.txt
@@ -50,9 +50,9 @@ steps:
- label: Async Engine, Inputs, Utils, Worker Test # 10min
timeout_in_minutes: 15
- mirror_hardwares: [amdexperimental, amdproduction]
+ mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
agent_pool: mi325_1
- # grade: Blocking
+ grade: Blocking
source_file_dependencies:
- vllm/
- tests/multimodal
@@ -61,17 +61,18 @@ steps:
- pytest -v -s -m 'not cpu_test' multimodal
- pytest -v -s utils_
-- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 4 mins
- timeout_in_minutes: 10
- mirror_hardwares: [amdexperimental, amdproduction]
+- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 15min
+ timeout_in_minutes: 20
+ mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
agent_pool: mi325_1
- # grade: Blocking
+ grade: Blocking
source_file_dependencies:
- vllm/
- tests/test_inputs.py
- tests/test_outputs.py
- tests/multimodal
- tests/standalone_tests/lazy_imports.py
+ - tests/tokenizers_
- tests/transformers_utils
- tests/config
no_gpu: true
@@ -80,6 +81,7 @@ steps:
- pytest -v -s test_inputs.py
- pytest -v -s test_outputs.py
- pytest -v -s -m 'cpu_test' multimodal
+ - pytest -v -s tokenizers_
- pytest -v -s transformers_utils
- pytest -v -s config
@@ -113,9 +115,9 @@ steps:
- pytest -v -s basic_correctness/test_cpu_offload.py
- label: Entrypoints Unit Tests # 5min
- mirror_hardwares: [amdexperimental, amdproduction]
+ mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
agent_pool: mi325_1
- # grade: Blocking
+ grade: Blocking
timeout_in_minutes: 10
working_dir: "/vllm-workspace/tests"
fast_check: true
@@ -212,6 +214,7 @@ steps:
# test with internal dp
- python3 ../examples/offline_inference/data_parallel.py --enforce-eager
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+ - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
@@ -250,9 +253,9 @@ steps:
- torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
- label: EPLB Algorithm Test # 5min
- mirror_hardwares: [amdexperimental, amdproduction]
+ mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
agent_pool: mi325_1
- # grade: Blocking
+ grade: Blocking
timeout_in_minutes: 15
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
@@ -308,23 +311,20 @@ steps:
- pytest -v -s test_regression.py
working_dir: "/vllm-workspace/tests" # optional
-- label: Engine Test # 25min
- timeout_in_minutes: 40
+- label: Engine Test # 9min
+ timeout_in_minutes: 15
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
source_file_dependencies:
- vllm/
- tests/engine
- - tests/tokenization
- tests/test_sequence
- tests/test_config
- tests/test_logger
- tests/test_vllm_port
commands:
- pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
- # OOM in the CI unless we run this separately
- - pytest -v -s tokenization
- label: V1 Test e2e + engine # 30min
timeout_in_minutes: 45
@@ -342,9 +342,9 @@ steps:
- label: V1 Test entrypoints # 35min
timeout_in_minutes: 50
- mirror_hardwares: [amdexperimental, amdproduction]
+ mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
agent_pool: mi325_1
- # grade: Blocking
+ grade: Blocking
source_file_dependencies:
- vllm/
- tests/v1
@@ -392,6 +392,20 @@ steps:
commands:
- pytest -v -s v1/attention
+- label: Batch Invariance Tests (H100) # 10min
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ timeout_in_minutes: 25
+ gpu: h100
+ source_file_dependencies:
+ - vllm/
+ - tests/v1/determinism/
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pip install pytest-timeout pytest-forked
+ - pytest -v -s v1/determinism/test_batch_invariance.py
+ - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
+
- label: V1 Test attention (B200) # 10min
timeout_in_minutes: 30
gpu: b200
@@ -402,9 +416,9 @@ steps:
- VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this
- label: V1 Test others (CPU) # 5 mins
- mirror_hardwares: [amdexperimental, amdproduction]
+ mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
agent_pool: mi325_1
- # grade: Blocking
+ grade: Blocking
source_file_dependencies:
- vllm/
- tests/v1
@@ -496,7 +510,7 @@ steps:
- label: PyTorch Compilation Unit Tests # 15min
timeout_in_minutes: 30
- mirror_hardwares: [amdexperimental]
+ mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
torch_nightly: true
@@ -513,7 +527,7 @@ steps:
- label: PyTorch Fullgraph Smoke Test # 15min
timeout_in_minutes: 30
- mirror_hardwares: [amdexperimental]
+ mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
torch_nightly: true
@@ -569,7 +583,7 @@ steps:
- label: Kernels Attention Test %N # 23min
timeout_in_minutes: 35
- mirror_hardwares: [amdexperimental]
+ mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_8
# grade: Blocking
source_file_dependencies:
@@ -596,7 +610,7 @@ steps:
- label: Kernels MoE Test %N # 40min
timeout_in_minutes: 60
- mirror_hardwares: [amdexperimental]
+ mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_8
# grade: Blocking
source_file_dependencies:
@@ -623,6 +637,26 @@ steps:
commands:
- pytest -v -s kernels/mamba
+- label: Kernels DeepGEMM Test (H100) # Nvidia-centric
+# Not replicating for CUTLASS & CuTe
+ timeout_in_minutes: 45
+ gpu: h100
+ num_gpus: 1
+ source_file_dependencies:
+ - tools/install_deepgemm.sh
+ - vllm/utils/deep_gemm.py
+ - vllm/model_executor/layers/fused_moe
+ - vllm/model_executor/layers/quantization
+ - tests/kernels/quantization/test_block_fp8.py
+ - tests/kernels/moe/test_deepgemm.py
+ - tests/kernels/moe/test_batched_deepgemm.py
+ - tests/kernels/attention/test_deepgemm_attention.py
+ commands:
+ - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm
+ - pytest -v -s kernels/moe/test_deepgemm.py
+ - pytest -v -s kernels/moe/test_batched_deepgemm.py
+ - pytest -v -s kernels/attention/test_deepgemm_attention.py
+
- label: Model Executor Test # 23min
timeout_in_minutes: 35
torch_nightly: true
@@ -681,6 +715,7 @@ steps:
# we can only upgrade after this is resolved
# TODO(jerryzh168): resolve the above comment
- uv pip install --system torchao==0.13.0
+ - uv pip install --system conch-triton-kernels
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
- label: LM Eval Small Models # 15min
@@ -900,6 +935,18 @@ steps:
commands:
- pytest -v -s models/language/pooling_mteb_test
+- label: Multi-Modal Processor Test (CPU)
+ timeout_in_minutes: 60
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ source_file_dependencies:
+ - vllm/
+ - tests/models/multimodal
+ no_gpu: true
+ commands:
+ - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+ - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
+
- label: Multi-Modal Processor Test # 44min
timeout_in_minutes: 60
mirror_hardwares: [amdexperimental]
@@ -1056,6 +1103,7 @@ steps:
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
- pytest -v -s tests/kernels/moe/test_flashinfer.py
+ - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
- label: Blackwell Fusion and Compile Tests # 30 min
timeout_in_minutes: 40
@@ -1065,11 +1113,19 @@ steps:
- csrc/quantization/fp4/
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
+ - vllm/v1/worker/
+ - vllm/v1/cudagraph_dispatcher.py
- vllm/compilation/
# can affect pattern matching
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
+ - vllm/model_executor/layers/fused_moe/layer.py
+ - tests/compile/test_fusion_attn.py
+ - tests/compile/test_silu_mul_quant_fusion.py
+ - tests/compile/distributed/test_fusion_all_reduce.py
+ - tests/compile/distributed/test_fusions_e2e.py
+ - tests/compile/fullgraph/test_full_graph.py
commands:
- nvidia-smi
- pytest -v -s tests/compile/test_fusion_attn.py
@@ -1080,7 +1136,7 @@ steps:
# Wrap with quotes to escape yaml
- "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
- - pytest -v -s tests/compile/distributed/test_full_graph.py::test_fp8_kv_scale_compile
+ - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
- label: Blackwell Fusion E2E Tests # 30 min
timeout_in_minutes: 40
@@ -1102,7 +1158,7 @@ steps:
commands:
- nvidia-smi
# Run all e2e fusion tests
- - pytest -v -s tests/compile/test_fusions_e2e.py
+ - pytest -v -s tests/compile/distributed/test_fusions_e2e.py
- label: ROCm GPT-OSS Eval
timeout_in_minutes: 60
@@ -1217,6 +1273,7 @@ steps:
- tests/v1/worker/test_worker_memory_snapshot.py
commands:
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+ - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
- pytest -v -s entrypoints/llm/test_collective_rpc.py
@@ -1252,7 +1309,7 @@ steps:
- label: Plugin Tests (2 GPUs) # 40min
timeout_in_minutes: 60
- mirror_hardwares: [amdexperimental]
+ mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_2
# grade: Blocking
working_dir: "/vllm-workspace/tests"
@@ -1328,7 +1385,7 @@ steps:
- label: Weight Loading Multiple GPU Test # 33min
timeout_in_minutes: 45
- mirror_hardwares: [amdexperimental]
+ mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_2
# grade: Blocking
working_dir: "/vllm-workspace/tests"
@@ -1428,14 +1485,14 @@ steps:
working_dir: "/vllm-workspace/"
num_gpus: 2
commands:
- - pytest -v -s tests/compile/distributed/test_async_tp.py
+ - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
#- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
- - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
- - pytest -v -s tests/compile/distributed/test_sequence_parallel.py
+ - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
+ - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
- pytest -v -s tests/distributed/test_context_parallel.py
- - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
+ - HIP_VISIBLE_DEVICES=0,1 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
- pytest -v -s tests/v1/distributed/test_dbo.py
##### B200 test #####
@@ -1465,7 +1522,7 @@ steps:
- bash .buildkite/scripts/run-prime-rl-test.sh
- label: DeepSeek V2-Lite Accuracy
- mirror_hardwares: [amdexperimental]
+ mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_4
# grade: Blocking
timeout_in_minutes: 60
@@ -1476,8 +1533,8 @@ steps:
commands:
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
-- label: Qwen3-30B-A3B-FP8-block Accuracy
- mirror_hardwares: [amdexperimental]
+- label: Qwen3-30B-A3B-FP8-block Accuracy (H100)
+ mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_4
# grade: Blocking
timeout_in_minutes: 60
@@ -1487,3 +1544,12 @@ steps:
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
+
+- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
+ timeout_in_minutes: 60
+ gpu: b200
+ optional: true
+ num_gpus: 2
+ working_dir: "/vllm-workspace"
+ commands:
+ - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
\ No newline at end of file
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index f1cd39ef4f948..52c848c784e53 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -57,14 +57,15 @@ steps:
- pytest -v -s -m 'not cpu_test' multimodal
- pytest -v -s utils_
-- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 4 mins
- timeout_in_minutes: 10
+- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 15min
+ timeout_in_minutes: 20
source_file_dependencies:
- vllm/
- tests/test_inputs.py
- tests/test_outputs.py
- tests/multimodal
- tests/standalone_tests/lazy_imports.py
+ - tests/tokenizers_
- tests/transformers_utils
- tests/config
no_gpu: true
@@ -73,6 +74,7 @@ steps:
- pytest -v -s test_inputs.py
- pytest -v -s test_outputs.py
- pytest -v -s -m 'cpu_test' multimodal
+ - pytest -v -s tokenizers_
- pytest -v -s transformers_utils
- pytest -v -s config
@@ -192,6 +194,7 @@ steps:
# test with internal dp
- python3 ../examples/offline_inference/data_parallel.py --enforce-eager
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+ - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
@@ -275,21 +278,18 @@ steps:
- pytest -v -s test_regression.py
working_dir: "/vllm-workspace/tests" # optional
-- label: Engine Test # 25min
- timeout_in_minutes: 40
+- label: Engine Test # 9min
+ timeout_in_minutes: 15
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
- tests/engine
- - tests/tokenization
- tests/test_sequence
- tests/test_config
- tests/test_logger
- tests/test_vllm_port
commands:
- pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
- # OOM in the CI unless we run this separately
- - pytest -v -s tokenization
- label: V1 Test e2e + engine # 30min
timeout_in_minutes: 45
@@ -390,20 +390,24 @@ steps:
- examples/
commands:
- pip install tensorizer # for tensorizer test
+ # for basic
+ - python3 offline_inference/basic/chat.py
- python3 offline_inference/basic/generate.py --model facebook/opt-125m
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
- - python3 offline_inference/basic/chat.py
- - python3 offline_inference/prefix_caching.py
- - python3 offline_inference/llm_engine_example.py
- - python3 offline_inference/audio_language.py --seed 0
- - python3 offline_inference/vision_language.py --seed 0
- - python3 offline_inference/vision_language_pooling.py --seed 0
- - python3 offline_inference/vision_language_multi_image.py --seed 0
- - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
- python3 offline_inference/basic/classify.py
- python3 offline_inference/basic/embed.py
- python3 offline_inference/basic/score.py
+ # for multi-modal models
+ - python3 offline_inference/audio_language.py --seed 0
+ - python3 offline_inference/vision_language.py --seed 0
+ - python3 offline_inference/vision_language_multi_image.py --seed 0
+ - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
+ # for pooling models
+ - python3 pooling/pooling/vision_language_pooling.py --seed 0
+ # for features demo
+ - python3 offline_inference/prefix_caching.py
+ - python3 offline_inference/llm_engine_example.py
+ - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
@@ -631,6 +635,7 @@ steps:
# we can only upgrade after this is resolved
# TODO(jerryzh168): resolve the above comment
- uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
+ - uv pip install --system conch-triton-kernels
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
- label: LM Eval Small Models # 53min
@@ -818,14 +823,24 @@ steps:
commands:
- pytest -v -s models/language/pooling_mteb_test
-- label: Multi-Modal Processor Test # 44min
+- label: Multi-Modal Processor Test (CPU)
+ timeout_in_minutes: 60
+ source_file_dependencies:
+ - vllm/
+ - tests/models/multimodal
+ no_gpu: true
+ commands:
+ - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+ - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
+
+- label: Multi-Modal Processor Test
timeout_in_minutes: 60
source_file_dependencies:
- vllm/
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- - pytest -v -s models/multimodal/processing
+ - pytest -v -s models/multimodal/processing/test_tensor_schema.py
- label: Multi-Modal Models Test (Standard) # 60min
timeout_in_minutes: 80
@@ -902,11 +917,12 @@ steps:
- label: Transformers Nightly Models Test
working_dir: "/vllm-workspace/"
optional: true
+ soft_fail: true
commands:
- pip install --upgrade git+https://github.com/huggingface/transformers
- - pytest -v -s tests/models/test_initialization.py -k 'not (Ultravox or Phi4Multimodal or MiniCPMO or Lfm2Moe or RobertaForSequenceClassification or Ovis2_5 or DeepseekOCR or KimiVL)'
+ - pytest -v -s tests/models/test_initialization.py
- pytest -v -s tests/models/test_transformers.py
- # - pytest -v -s tests/models/multimodal/processing/
+ - pytest -v -s tests/models/multimodal/processing/
- pytest -v -s tests/models/multimodal/test_mapping.py
- python3 examples/offline_inference/basic/chat.py
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
@@ -1116,6 +1132,7 @@ steps:
# https://github.com/NVIDIA/nccl/issues/1838
- export NCCL_CUMEM_HOST_ENABLE=0
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+ - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
- pytest -v -s entrypoints/llm/test_collective_rpc.py
@@ -1299,11 +1316,11 @@ steps:
working_dir: "/vllm-workspace/"
num_gpus: 2
commands:
- - pytest -v -s tests/compile/distributed/test_async_tp.py
+ - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
- - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
- - pytest -v -s tests/distributed/test_sequence_parallel.py
+ - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
+ - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
- pytest -v -s tests/distributed/test_context_parallel.py
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
- pytest -v -s tests/v1/distributed/test_dbo.py
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 3247408e1163e..d6447649cd89a 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -146,9 +146,10 @@ mkdocs.yaml @hmellor
/requirements/kv_connectors.txt @NickLucche
# Pooling models
-/examples/*/pooling/ @noooop
+/examples/pooling @noooop
/tests/models/*/pooling* @noooop
/tests/entrypoints/pooling @noooop
+/vllm/entrypoints/pooling @noooop
/vllm/config/pooler.py @noooop
/vllm/pooling_params.py @noooop
/vllm/model_executor/layers/pooler.py @noooop
diff --git a/.github/workflows/cleanup_pr_body.yml b/.github/workflows/cleanup_pr_body.yml
index c3e132a536a42..56fbe5ca704a1 100644
--- a/.github/workflows/cleanup_pr_body.yml
+++ b/.github/workflows/cleanup_pr_body.yml
@@ -13,10 +13,10 @@ jobs:
steps:
- name: Checkout repository
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+ uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
- name: Set up Python
- uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
+ uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
with:
python-version: '3.12'
diff --git a/.github/workflows/issue_autolabel.yml b/.github/workflows/issue_autolabel.yml
index 7d565ef9f2e45..629966b959330 100644
--- a/.github/workflows/issue_autolabel.yml
+++ b/.github/workflows/issue_autolabel.yml
@@ -105,6 +105,31 @@ jobs:
}
],
},
+ cpu: {
+ // Keyword search - matches whole words only (with word boundaries)
+ keywords: [
+ {
+ term: "CPU Backend",
+ searchIn: "title"
+ },
+ {
+ term: "x86",
+ searchIn: "title"
+ },
+ {
+ term: "ARM",
+ searchIn: "title"
+ },
+ {
+ term: "Apple Silicon",
+ searchIn: "title"
+ },
+ {
+ term: "IBM Z",
+ searchIn: "title"
+ },
+ ],
+ },
// Add more label configurations here as needed
// example: {
// keywords: [...],
diff --git a/.github/workflows/macos-smoke-test.yml b/.github/workflows/macos-smoke-test.yml
index a183033c9adde..3a12c4b3a8300 100644
--- a/.github/workflows/macos-smoke-test.yml
+++ b/.github/workflows/macos-smoke-test.yml
@@ -12,7 +12,7 @@ jobs:
timeout-minutes: 30
steps:
- - uses: actions/checkout@v4
+ - uses: actions/checkout@v6
- uses: astral-sh/setup-uv@v7
with:
diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
index e21d13b8161f3..a03b979ad761d 100644
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@@ -16,8 +16,8 @@ jobs:
pre-commit:
runs-on: ubuntu-latest
steps:
- - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
+ - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
+ - uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
with:
python-version: "3.12"
- run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a4cf51d17e982..e09972fe71995 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -136,7 +136,7 @@ elseif(HIP_FOUND)
# ROCm 5.X and 6.X
if (ROCM_VERSION_DEV_MAJOR GREATER_EQUAL 5 AND
- NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM})
+ Torch_VERSION VERSION_LESS ${TORCH_SUPPORTED_VERSION_ROCM})
message(WARNING "Pytorch version >= ${TORCH_SUPPORTED_VERSION_ROCM} "
"expected for ROCm build, saw ${Torch_VERSION} instead.")
endif()
@@ -354,8 +354,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# Only build Marlin kernels if we are building for at least some compatible archs.
# Keep building Marlin for 9.0 as there are some group sizes and shapes that
# are not supported by Machete yet.
- # 9.0 for latest bf16 atomicAdd PTX
- cuda_archs_loose_intersection(MARLIN_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
+
+ # marlin arches for fp16 output
+ cuda_archs_loose_intersection(MARLIN_ARCHS "8.0+PTX" "${CUDA_ARCHS}")
+ # marlin arches for bf16 output (we need 9.0 for bf16 atomicAdd PTX)
+ cuda_archs_loose_intersection(MARLIN_BF16_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
+ # marlin arches for fp8 input
+ # - sm80 doesn't support fp8 computation
+ # - sm90 and sm100 don't support QMMA.16832.F32.E4M3.E4M3 SASS instruction
+ # so we only enable fp8 computation for SM89 (e.g. RTX 40x0) and 12.0 (e.g. RTX 50x0)
+ cuda_archs_loose_intersection(MARLIN_FP8_ARCHS "8.9;12.0" "${CUDA_ARCHS}")
+
if (MARLIN_ARCHS)
#
@@ -365,16 +374,18 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
set(MARLIN_GEN_SCRIPT
${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/gptq_marlin/generate_kernels.py)
file(MD5 ${MARLIN_GEN_SCRIPT} MARLIN_GEN_SCRIPT_HASH)
+ list(JOIN CUDA_ARCHS "," CUDA_ARCHS_STR)
+ set(MARLIN_GEN_SCRIPT_HASH_AND_ARCH "${MARLIN_GEN_SCRIPT_HASH}(ARCH:${CUDA_ARCHS_STR})")
- message(STATUS "Marlin generation script hash: ${MARLIN_GEN_SCRIPT_HASH}")
- message(STATUS "Last run Marlin generate script hash: $CACHE{MARLIN_GEN_SCRIPT_HASH}")
+ message(STATUS "Marlin generation script hash: ${MARLIN_GEN_SCRIPT_HASH_AND_ARCH}")
+ message(STATUS "Last run Marlin generate script hash: $CACHE{MARLIN_GEN_SCRIPT_HASH_AND_ARCH}")
- if (NOT DEFINED CACHE{MARLIN_GEN_SCRIPT_HASH}
- OR NOT $CACHE{MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MARLIN_GEN_SCRIPT_HASH})
+ if (NOT DEFINED CACHE{MARLIN_GEN_SCRIPT_HASH_AND_ARCH}
+ OR NOT $CACHE{MARLIN_GEN_SCRIPT_HASH_AND_ARCH} STREQUAL ${MARLIN_GEN_SCRIPT_HASH_AND_ARCH})
execute_process(
COMMAND ${CMAKE_COMMAND} -E env
PYTHONPATH=$PYTHONPATH
- ${Python_EXECUTABLE} ${MARLIN_GEN_SCRIPT}
+ ${Python_EXECUTABLE} ${MARLIN_GEN_SCRIPT} ${CUDA_ARCHS_STR}
RESULT_VARIABLE marlin_generation_result
OUTPUT_VARIABLE marlin_generation_result
OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log
@@ -387,15 +398,15 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"\nCheck the log for details: "
"${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log")
else()
- set(MARLIN_GEN_SCRIPT_HASH ${MARLIN_GEN_SCRIPT_HASH}
- CACHE STRING "Last run Marlin generate script hash" FORCE)
+ set(MARLIN_GEN_SCRIPT_HASH_AND_ARCH ${MARLIN_GEN_SCRIPT_HASH_AND_ARCH}
+ CACHE STRING "Last run Marlin generate script hash and arch" FORCE)
message(STATUS "Marlin generation completed successfully.")
endif()
else()
message(STATUS "Marlin generation script has not changed, skipping generation.")
endif()
- file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/kernel_*.cu")
+ file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_float16.cu")
set_gencode_flags_for_srcs(
SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
CUDA_ARCHS "${MARLIN_ARCHS}")
@@ -403,12 +414,34 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
set_source_files_properties(${MARLIN_TEMPLATE_KERNEL_SRC}
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
endif()
-
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})
+ file(GLOB MARLIN_TEMPLATE_BF16_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_bfloat16.cu")
+ set_gencode_flags_for_srcs(
+ SRCS "${MARLIN_TEMPLATE_BF16_KERNEL_SRC}"
+ CUDA_ARCHS "${MARLIN_BF16_ARCHS}")
+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+ set_source_files_properties(${MARLIN_TEMPLATE_BF16_KERNEL_SRC}
+ PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
+ endif()
+ list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_BF16_KERNEL_SRC})
+
+ if (MARLIN_FP8_ARCHS)
+ file(GLOB MARLIN_TEMPLATE_FP8_KERNEL_SRC "csrc/quantization/gptq_marlin/sm89_kernel_*.cu")
+ set_gencode_flags_for_srcs(
+ SRCS "${MARLIN_TEMPLATE_FP8_KERNEL_SRC}"
+ CUDA_ARCHS "${MARLIN_FP8_ARCHS}")
+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+ set_source_files_properties(${MARLIN_TEMPLATE_FP8_KERNEL_SRC}
+ PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
+ endif()
+ list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_FP8_KERNEL_SRC})
+ endif()
+
set(MARLIN_SRCS
"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
"csrc/quantization/gptq_marlin/gptq_marlin.cu"
+ "csrc/quantization/gptq_marlin/marlin_int4_fp8_preprocess.cu"
"csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
"csrc/quantization/gptq_marlin/awq_marlin_repack.cu")
set_gencode_flags_for_srcs(
@@ -604,12 +637,15 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
set(SRCS
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
"csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu"
- "csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu")
+ "csrc/quantization/fp4/nvfp4_experts_quant.cu"
+ "csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu"
+ "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${FP4_ARCHS}")
list(APPEND VLLM_EXT_SRC "${SRCS}")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4_SM120=1")
+ list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM120=1")
message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}")
else()
message(STATUS "Not building NVFP4 as no compatible archs were found.")
@@ -938,8 +974,15 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
CUDA_ARCHS "${CUDA_ARCHS}")
list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
- # 9.0 for latest bf16 atomicAdd PTX
- cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
+ # moe marlin arches
+ # note that we always set `use_atomic_add=False` for moe marlin now,
+ # so we don't need 9.0 for bf16 atomicAdd PTX
+ cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0+PTX" "${CUDA_ARCHS}")
+ # moe marlin arches for fp8 input
+ # - sm80 doesn't support fp8 computation
+ # - sm90 and sm100 don't support QMMA.16832.F32.E4M3.E4M3 SASS instruction
+ # so we only enable fp8 computation for SM89 (e.g. RTX 40x0) and 12.0 (e.g. RTX 50x0)
+ cuda_archs_loose_intersection(MARLIN_MOE_FP8_ARCHS "8.9;12.0" "${CUDA_ARCHS}")
if (MARLIN_MOE_ARCHS)
#
@@ -949,16 +992,18 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
set(MOE_MARLIN_GEN_SCRIPT
${CMAKE_CURRENT_SOURCE_DIR}/csrc/moe/marlin_moe_wna16/generate_kernels.py)
file(MD5 ${MOE_MARLIN_GEN_SCRIPT} MOE_MARLIN_GEN_SCRIPT_HASH)
+ list(JOIN CUDA_ARCHS "," CUDA_ARCHS_STR)
+ set(MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH "${MOE_MARLIN_GEN_SCRIPT_HASH}(ARCH:${CUDA_ARCHS_STR})")
- message(STATUS "Marlin MOE generation script hash: ${MOE_MARLIN_GEN_SCRIPT_HASH}")
- message(STATUS "Last run Marlin MOE generate script hash: $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}")
+ message(STATUS "Marlin MOE generation script hash with arch: ${MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}")
+ message(STATUS "Last run Marlin MOE generate script hash with arch: $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}")
- if (NOT DEFINED CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}
- OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH})
+ if (NOT DEFINED CACHE{MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}
+ OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH})
execute_process(
COMMAND ${CMAKE_COMMAND} -E env
PYTHONPATH=$PYTHONPATH
- ${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT}
+ ${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT} ${CUDA_ARCHS_STR}
RESULT_VARIABLE moe_marlin_generation_result
OUTPUT_VARIABLE moe_marlin_generation_output
OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log
@@ -971,7 +1016,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"\nCheck the log for details: "
"${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log")
else()
- set(MOE_MARLIN_GEN_SCRIPT_HASH ${MOE_MARLIN_GEN_SCRIPT_HASH}
+ set(MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH ${MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}
CACHE STRING "Last run Marlin MOE generate script hash" FORCE)
message(STATUS "Marlin MOE generation completed successfully.")
endif()
@@ -979,16 +1024,28 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
message(STATUS "Marlin MOE generation script has not changed, skipping generation.")
endif()
- file(GLOB MOE_WNAA16_MARLIN_SRC "csrc/moe/marlin_moe_wna16/*.cu")
+ file(GLOB MARLIN_MOE_SRC "csrc/moe/marlin_moe_wna16/sm80_kernel_*.cu")
+ list(APPEND MARLIN_MOE_SRC "csrc/moe/marlin_moe_wna16/ops.cu")
set_gencode_flags_for_srcs(
- SRCS "${MOE_WNAA16_MARLIN_SRC}"
+ SRCS "${MARLIN_MOE_SRC}"
CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
- set_source_files_properties(${MOE_WNAA16_MARLIN_SRC}
+ set_source_files_properties(${MARLIN_MOE_SRC}
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
endif()
+ list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_SRC})
- list(APPEND VLLM_MOE_EXT_SRC ${MOE_WNAA16_MARLIN_SRC})
+ if (MARLIN_MOE_FP8_ARCHS)
+ file(GLOB MARLIN_MOE_FP8_SRC "csrc/moe/marlin_moe_wna16/sm89_kernel_*.cu")
+ set_gencode_flags_for_srcs(
+ SRCS "${MARLIN_MOE_FP8_SRC}"
+ CUDA_ARCHS "${MARLIN_MOE_FP8_ARCHS}")
+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+ set_source_files_properties(${MARLIN_MOE_FP8_SRC}
+ PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
+ endif()
+ list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_FP8_SRC})
+ endif()
message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}")
else()
diff --git a/README.md b/README.md
index 033e1035d8916..abbb63158f166 100644
--- a/README.md
+++ b/README.md
@@ -21,6 +21,7 @@ Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundatio
*Latest News* 🔥
+- [2025/11] We hosted [vLLM Bangkok Meetup](https://luma.com/v0f647nv). We explored vLLM and LMCache inference and low-resource language adaptation with speakers from Embedded LLM, AMD, and Red Hat. Please find the meetup slides [here](https://drive.google.com/drive/folders/1H0DS57F8HQ5q3kSOSoRmucPJWL3E0A_X?usp=sharing).
- [2025/11] We hosted [the first vLLM Europe Meetup in Zurich](https://luma.com/0gls27kb) focused on quantization, distributed inference, and reinforcement learning at scale with speakers from Mistral, IBM, and Red Hat. Please find the meetup slides [here](https://docs.google.com/presentation/d/1UC9PTLCHYXQpOmJDSFg6Sljra3iVXzc09DeEI7dnxMc/edit?usp=sharing) and recording [here](https://www.youtube.com/watch?v=6m6ZE6yVEDI)
- [2025/11] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/xSrYXjNgr1HbCP4ExYNG1w) focusing on distributed inference and diverse accelerator support with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1nQJ8ZkLSjKxvu36sSHaceVXtttbLvvu-?usp=drive_link).
- [2025/10] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/__xb4OyOsImz-9eAVrdlcg) focused on hands-on vLLM inference optimization! Please find the meetup slides [here](https://drive.google.com/drive/folders/1KqwjsFJLfEsC8wlDugnrR61zsWHt94Q6).
diff --git a/benchmarks/auto_tune/README.md b/benchmarks/auto_tune/README.md
index d1bdb4c43f10b..9a9600e08dafe 100644
--- a/benchmarks/auto_tune/README.md
+++ b/benchmarks/auto_tune/README.md
@@ -83,7 +83,7 @@ MIN_CACHE_HIT_PCT=0
MAX_LATENCY_ALLOWED_MS=100000000000 # A very large number
```
-#### 2. Maximize Throughput with a Latency Requirement
+### 2. Maximize Throughput with a Latency Requirement
- **Goal**: Find the best server parameters when P99 end-to-end latency must be below 500ms.
- **Configuration**:
@@ -96,7 +96,7 @@ MIN_CACHE_HIT_PCT=0
MAX_LATENCY_ALLOWED_MS=500
```
-#### 3. Maximize Throughput with Prefix Caching and Latency Requirements
+### 3. Maximize Throughput with Prefix Caching and Latency Requirements
- **Goal**: Find the best server parameters assuming a 60% prefix cache hit rate and a latency requirement of 500ms.
- **Configuration**:
diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
index 4021fede72153..d69d74ca61f54 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -620,7 +620,7 @@ def get_tokenizer(
kwargs["use_fast"] = False
if tokenizer_mode == "mistral":
try:
- from vllm.transformers_utils.tokenizer import MistralTokenizer
+ from vllm.tokenizers import MistralTokenizer
except ImportError as e:
raise ImportError(
"MistralTokenizer requires vllm package.\n"
diff --git a/benchmarks/benchmark_ngram_proposer.py b/benchmarks/benchmark_ngram_proposer.py
index dedb564fffac8..cac401456b62a 100644
--- a/benchmarks/benchmark_ngram_proposer.py
+++ b/benchmarks/benchmark_ngram_proposer.py
@@ -108,7 +108,10 @@ def benchmark_batched_propose(args):
device_config=DeviceConfig(device=current_platform.device_type),
parallel_config=ParallelConfig(),
load_config=LoadConfig(),
- scheduler_config=SchedulerConfig(),
+ scheduler_config=SchedulerConfig(
+ max_model_len=model_config.max_model_len,
+ is_encoder_decoder=model_config.is_encoder_decoder,
+ ),
)
# monkey patch vllm.v1.worker.gpu_model_runner.get_pp_group
diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py
index 28fc383a318dd..e6391134ff932 100644
--- a/benchmarks/benchmark_prefix_caching.py
+++ b/benchmarks/benchmark_prefix_caching.py
@@ -40,7 +40,7 @@ from vllm.engine.arg_utils import EngineArgs
from vllm.utils.argparse_utils import FlexibleArgumentParser
try:
- from vllm.transformers_utils.tokenizer import get_tokenizer
+ from vllm.tokenizers import get_tokenizer
except ImportError:
from backend_request_func import get_tokenizer
diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py
index 55001cf3722a0..df122b4c5e8db 100644
--- a/benchmarks/benchmark_serving_structured_output.py
+++ b/benchmarks/benchmark_serving_structured_output.py
@@ -46,7 +46,7 @@ from tqdm.asyncio import tqdm
from transformers import PreTrainedTokenizerBase
try:
- from vllm.transformers_utils.tokenizer import get_tokenizer
+ from vllm.tokenizers import get_tokenizer
except ImportError:
from backend_request_func import get_tokenizer
diff --git a/benchmarks/kernels/benchmark_machete.py b/benchmarks/kernels/benchmark_machete.py
index 8787724d77cfb..ac78c019a59e5 100644
--- a/benchmarks/kernels/benchmark_machete.py
+++ b/benchmarks/kernels/benchmark_machete.py
@@ -237,6 +237,7 @@ def marlin_create_bench_fn(bt: BenchmarkTensors) -> Callable:
b_q_weight=w_q,
b_bias=None,
b_scales=w_s,
+ a_scales=None,
global_scale=None,
b_zeros=w_zp,
g_idx=g_idx,
diff --git a/benchmarks/kernels/benchmark_marlin.py b/benchmarks/kernels/benchmark_marlin.py
index 12ca9214b1f95..48d790aec9e07 100644
--- a/benchmarks/kernels/benchmark_marlin.py
+++ b/benchmarks/kernels/benchmark_marlin.py
@@ -263,7 +263,7 @@ def bench_run(
results.append(
benchmark.Timer(
- stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)", # noqa: E501
+ stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, None, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)", # noqa: E501
globals=globals,
label=label,
sub_label=sub_label,
@@ -273,7 +273,7 @@ def bench_run(
results.append(
benchmark.Timer(
- stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)", # noqa: E501
+ stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, None, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)", # noqa: E501
globals=globals,
label=label,
sub_label=sub_label,
diff --git a/cmake/utils.cmake b/cmake/utils.cmake
index ca0062ba4fabe..5047c354ff7d2 100644
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -495,7 +495,13 @@ function (define_extension_target MOD_NAME)
set(SOABI_KEYWORD "")
endif()
- if (ARG_USE_SABI)
+ run_python(IS_FREETHREADED_PYTHON
+ "import sysconfig; print(1 if sysconfig.get_config_var(\"Py_GIL_DISABLED\") else 0)"
+ "Failed to determine whether interpreter is free-threaded")
+
+ # Free-threaded Python doesn't yet support the stable ABI (see PEP 803/809),
+ # so avoid using the stable ABI under free-threading only.
+ if (ARG_USE_SABI AND NOT IS_FREETHREADED_PYTHON)
Python_add_library(${MOD_NAME} MODULE USE_SABI ${ARG_USE_SABI} ${SOABI_KEYWORD} "${ARG_SOURCES}")
else()
Python_add_library(${MOD_NAME} MODULE ${SOABI_KEYWORD} "${ARG_SOURCES}")
diff --git a/csrc/attention/merge_attn_states.cu b/csrc/attention/merge_attn_states.cu
index 229d9862fb670..27d1e990c611e 100644
--- a/csrc/attention/merge_attn_states.cu
+++ b/csrc/attention/merge_attn_states.cu
@@ -16,7 +16,8 @@ __global__ void merge_attn_states_kernel(
scalar_t* output, float* output_lse, const scalar_t* prefix_output,
const float* prefix_lse, const scalar_t* suffix_output,
const float* suffix_lse, const uint num_tokens, const uint num_heads,
- const uint head_size) {
+ const uint head_size, const uint prefix_head_stride,
+ const uint output_head_stride) {
using pack_128b_t = uint4;
const uint pack_size = 16 / sizeof(scalar_t);
const uint threads_per_head = head_size / pack_size;
@@ -34,11 +35,13 @@ __global__ void merge_attn_states_kernel(
const uint head_idx = token_head_idx % num_heads;
const uint pack_offset = pack_idx * pack_size; // (0~15)*8, etc.
- const uint head_offset =
- token_idx * num_heads * head_size + head_idx * head_size;
- const scalar_t* prefix_head_ptr = prefix_output + head_offset;
- const scalar_t* suffix_head_ptr = suffix_output + head_offset;
- scalar_t* output_head_ptr = output + head_offset;
+ const uint src_head_offset = token_idx * num_heads * prefix_head_stride +
+ head_idx * prefix_head_stride;
+ const uint dst_head_offset = token_idx * num_heads * output_head_stride +
+ head_idx * output_head_stride;
+ const scalar_t* prefix_head_ptr = prefix_output + src_head_offset;
+ const scalar_t* suffix_head_ptr = suffix_output + src_head_offset;
+ scalar_t* output_head_ptr = output + dst_head_offset;
float p_lse = prefix_lse[head_idx * num_tokens + token_idx];
float s_lse = suffix_lse[head_idx * num_tokens + token_idx];
@@ -140,7 +143,7 @@ __global__ void merge_attn_states_kernel(
reinterpret_cast(prefix_lse.data_ptr()), \
reinterpret_cast(suffix_output.data_ptr()), \
reinterpret_cast(suffix_lse.data_ptr()), num_tokens, \
- num_heads, head_size); \
+ num_heads, head_size, prefix_head_stride, output_head_stride); \
}
/*@brief Merges the attention states from prefix and suffix
@@ -166,17 +169,11 @@ void merge_attn_states_launcher(torch::Tensor& output,
const uint num_tokens = output.size(0);
const uint num_heads = output.size(1);
const uint head_size = output.size(2);
+ const uint prefix_head_stride = prefix_output.stride(1);
+ const uint output_head_stride = output.stride(1);
const uint pack_size = 16 / sizeof(scalar_t);
TORCH_CHECK(head_size % pack_size == 0,
"headsize must be multiple of pack_size:", pack_size);
- TORCH_CHECK(output.stride(-2) == head_size && output.stride(-1) == 1,
- "output heads must be contiguous in memory");
- TORCH_CHECK(
- prefix_output.stride(-2) == head_size && prefix_output.stride(-1) == 1,
- "prefix_output heads must be contiguous in memory");
- TORCH_CHECK(
- suffix_output.stride(-2) == head_size && suffix_output.stride(-1) == 1,
- "suffix_output heads must be contiguous in memory");
float* output_lse_ptr = nullptr;
if (output_lse.has_value()) {
output_lse_ptr = output_lse.value().data_ptr();
diff --git a/csrc/cpu/utils.cpp b/csrc/cpu/utils.cpp
index 5199ba2af024f..3dacfc7b2b7a3 100644
--- a/csrc/cpu/utils.cpp
+++ b/csrc/cpu/utils.cpp
@@ -51,12 +51,13 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
if (node_id != -1) {
node_ids.insert(node_id);
}
- TORCH_WARN(node_id == mem_node_id, "CPU ", cpu_id, " is on NUMA node ",
- node_id, ", but CPU ", omp_cpu_ids.front(),
- " is on NUMA node ", mem_node_id,
- ". All CPUs should be on the same NUMA node for optimal "
- "performance. Memory will be bound to NUMA node ",
- mem_node_id, ".");
+ if (node_id != mem_node_id) {
+ TORCH_WARN("CPU ", cpu_id, " is on NUMA node ", node_id, ", but CPU ",
+ omp_cpu_ids.front(), " is on NUMA node ", mem_node_id,
+ ". All CPUs should be on the same NUMA node for optimal "
+ "performance. Memory will be bound to NUMA node ",
+ mem_node_id, ".");
+ }
}
// Concatenate all node_ids into a single comma-separated string
if (!node_ids.empty()) {
diff --git a/csrc/moe/dynamic_4bit_int_moe_cpu.cpp b/csrc/moe/dynamic_4bit_int_moe_cpu.cpp
index df47bb8dd1d7d..58dc402016881 100644
--- a/csrc/moe/dynamic_4bit_int_moe_cpu.cpp
+++ b/csrc/moe/dynamic_4bit_int_moe_cpu.cpp
@@ -93,16 +93,16 @@ torch::Tensor dynamic_4bit_int_moe_cpu(
}
auto Y_all = at::empty({offsets[E], H}, x_c.options());
- at::parallel_for(0, E, 1, [&](int64_t e_begin, int64_t e_end) {
+ at::parallel_for(0, offsets[E], 0, [&](int64_t idx_begin, int64_t idx_end) {
c10::InferenceMode guard;
- for (int64_t e = e_begin; e < e_end; ++e) {
- const int64_t te = counts[e];
- if (te == 0) {
+ for (int64_t e = 0; e < E; ++e) {
+ int64_t start = std::max(offsets[e], idx_begin);
+ int64_t end = std::min(offsets[e + 1], idx_end);
+ int64_t te = end - start;
+ if (te <= 0) {
continue;
}
- const int64_t start = offsets[e];
-
auto x_e = X_all.narrow(/*dim=*/0, /*start=*/start, /*length=*/te);
auto w13_e = w13_packed.select(/*dim=*/0, e);
diff --git a/csrc/moe/marlin_moe_wna16/.gitignore b/csrc/moe/marlin_moe_wna16/.gitignore
index 77088552b85b4..ba805f9250ece 100644
--- a/csrc/moe/marlin_moe_wna16/.gitignore
+++ b/csrc/moe/marlin_moe_wna16/.gitignore
@@ -1 +1,2 @@
-kernel_*.cu
\ No newline at end of file
+sm*_kernel_*.cu
+kernel_selector.h
diff --git a/csrc/moe/marlin_moe_wna16/generate_kernels.py b/csrc/moe/marlin_moe_wna16/generate_kernels.py
index be5b68cc53e6f..88f1055337fd5 100644
--- a/csrc/moe/marlin_moe_wna16/generate_kernels.py
+++ b/csrc/moe/marlin_moe_wna16/generate_kernels.py
@@ -4,134 +4,282 @@ import glob
import itertools
import os
import subprocess
+import sys
import jinja2
-FILE_HEAD = """
-// auto generated by generate.py
-// clang-format off
+ARCHS = []
+SUPPORT_FP8 = False
+for arch in sys.argv[1].split(","):
+ arch = arch[: arch.index(".") + 2].replace(".", "")
+ arch = int(arch)
+ # only SM89 and SM120 fully support
+ # mma.sync.aligned.m16n8k32.row.col.f32.e4m3.e4m3.f32.
+    # SM90 and SM100 can use this PTX, but it's simulated
+ # with FP16 MMA, so it cannot achieve any acceleration.
+ if arch in [89, 120]:
+ SUPPORT_FP8 = True
+FILE_HEAD_COMMENT = """
+// auto generated by generate_kernels.py
+// clang-format off
+""".lstrip()
+
+FILE_HEAD = (
+ FILE_HEAD_COMMENT
+ + """
#include "kernel.h"
#include "marlin_template.h"
namespace MARLIN_NAMESPACE_NAME {
-""".strip()
+"""
+)
TEMPLATE = (
"template __global__ void Marlin<"
- "{{scalar_t}}, "
- "{{w_type_id}}, "
+ "{{a_type_id}}, "
+ "{{b_type_id}}, "
+ "{{c_type_id}}, "
"{{s_type_id}}, "
"{{threads}}, "
"{{thread_m_blocks}}, "
"{{thread_n_blocks}}, "
"{{thread_k_blocks}}, "
- "{{'true' if m_block_size_8 else 'false'}}, "
+ "{{m_block_size_8}}, "
"{{stages}}, "
"{{group_blocks}}, "
- "{{'true' if is_zp_float else 'false'}}>"
+ "{{is_zp_float}}>"
"( MARLIN_KERNEL_PARAMS );"
)
-# int8 with zero point case (vllm::kU8) is also supported,
-# we don't add it to reduce wheel size.
-SCALAR_TYPES = [
- "vllm::kU4",
- "vllm::kU4B8",
- "vllm::kU8B128",
- "vllm::kFE4M3fn",
- "vllm::kFE2M1f",
-]
THREAD_CONFIGS = [(128, 128, 256), (64, 256, 256), (64, 128, 128)]
THREAD_M_BLOCKS = [0.5, 1, 2, 3, 4]
-# group_blocks:
-# = 0 : act order case
-# = -1 : channelwise quantization
-# > 0 : group_size=16*group_blocks
-GROUP_BLOCKS = [0, -1, 1, 2, 4, 8]
-DTYPES = ["fp16", "bf16"]
+
+QUANT_CONFIGS = [
+ # AWQ-INT4
+ {
+ "b_type": "kU4",
+ "thread_configs": THREAD_CONFIGS,
+ "thread_m_blocks": THREAD_M_BLOCKS,
+ "group_blocks": [-1, 2, 4, 8],
+ },
+ # GPTQ-INT4
+ {
+ "b_type": "kU4B8",
+ "thread_configs": THREAD_CONFIGS,
+ "thread_m_blocks": THREAD_M_BLOCKS,
+ "group_blocks": [-1, 0, 2, 4, 8],
+ },
+ # AWQ-INT8
+ {
+ "b_type": "kU8B128",
+ "thread_configs": THREAD_CONFIGS,
+ "thread_m_blocks": THREAD_M_BLOCKS,
+ "group_blocks": [-1, 0, 2, 4, 8],
+ },
+ # FP8
+ {
+ "b_type": "kFE4M3fn",
+ "thread_configs": THREAD_CONFIGS,
+ "thread_m_blocks": THREAD_M_BLOCKS,
+ "group_blocks": [-1, 8],
+ },
+ # NVFP4
+ {
+ "b_type": "kFE2M1f",
+ "s_type": "kFE4M3fn",
+ "thread_configs": THREAD_CONFIGS,
+ "thread_m_blocks": THREAD_M_BLOCKS,
+ "group_blocks": [1],
+ },
+ # MXFP4
+ {
+ "a_type": ["kBFloat16"],
+ "b_type": "kFE2M1f",
+ "s_type": "kFE8M0fnu",
+ "thread_configs": THREAD_CONFIGS,
+ "thread_m_blocks": THREAD_M_BLOCKS,
+ "group_blocks": [2],
+ },
+ # AWQ-INT4 with INT8 activation
+ {
+ "a_type": ["kS8"],
+ "b_type": "kU4",
+ "thread_configs": THREAD_CONFIGS,
+ "thread_m_blocks": [1, 2, 3, 4],
+ "group_blocks": [-1, 2, 4, 8],
+ },
+ # GPTQ-INT4 with INT8 activation
+ {
+ "a_type": ["kS8"],
+ "b_type": "kU4B8",
+ "thread_configs": THREAD_CONFIGS,
+ "thread_m_blocks": [1, 2, 3, 4],
+ "group_blocks": [-1, 2, 4, 8],
+ },
+ # GPTQ-INT4 with FP8 activation
+ {
+ "a_type": ["kFE4M3fn"],
+ "b_type": "kU4B8",
+ "thread_configs": THREAD_CONFIGS,
+ "thread_m_blocks": [1, 2, 3, 4],
+ "group_blocks": [-1, 2, 4, 8],
+ },
+ # AWQ-INT4 with FP8 activation
+ {
+ "a_type": ["kFE4M3fn"],
+ "b_type": "kU4",
+ "thread_configs": THREAD_CONFIGS,
+ "thread_m_blocks": [1, 2, 3, 4],
+ "group_blocks": [-1, 2, 4, 8],
+ },
+ # MXFP4 with FP8 activation
+ {
+ "a_type": ["kFE4M3fn"],
+ "b_type": "kFE2M1f",
+ "c_type": ["kBFloat16"],
+ "s_type": "kFE8M0fnu",
+ "thread_configs": THREAD_CONFIGS,
+ "thread_m_blocks": [1, 2, 3, 4],
+ "group_blocks": [2],
+ },
+]
def remove_old_kernels():
- for filename in glob.glob(os.path.dirname(__file__) + "/kernel_*.cu"):
+ for filename in glob.glob(os.path.dirname(__file__) + "/*kernel_*.cu"):
subprocess.call(["rm", "-f", filename])
+ filename = os.path.dirname(__file__) + "/kernel_selector.h"
+ subprocess.call(["rm", "-f", filename])
+
def generate_new_kernels():
- for scalar_type, dtype in itertools.product(SCALAR_TYPES, DTYPES):
+ result_dict = {}
+
+ for quant_config in QUANT_CONFIGS:
+ c_types = quant_config.get("c_type", ["kFloat16", "kBFloat16"])
+ a_types = quant_config.get("a_type", ["kFloat16", "kBFloat16"])
+ b_type = quant_config["b_type"]
+ all_group_blocks = quant_config["group_blocks"]
+ all_m_blocks = quant_config["thread_m_blocks"]
+ all_thread_configs = quant_config["thread_configs"]
+
+ for a_type, c_type in itertools.product(a_types, c_types):
+ if not SUPPORT_FP8 and a_type == "kFE4M3fn":
+ continue
+ if "16" in a_type and "16" in c_type and a_type != c_type:
+ continue
+ s_type = quant_config.get("s_type", c_type)
+ if (a_type, b_type, c_type) not in result_dict:
+ result_dict[(a_type, b_type, c_type)] = []
+
+ for group_blocks, m_blocks, thread_configs in itertools.product(
+ all_group_blocks, all_m_blocks, all_thread_configs
+ ):
+ thread_k, thread_n, threads = thread_configs
+
+ if threads == 256:
+ # for small batch (m_blocks == 1),
+ # we only need (128, 128, 256)
+ # for large batch (m_blocks > 1),
+ # we only need (64, 256, 256)
+ if m_blocks <= 1 and (thread_k, thread_n) != (128, 128):
+ continue
+ if m_blocks > 1 and (thread_k, thread_n) != (64, 256):
+ continue
+
+ config = {
+ "threads": threads,
+ "s_type": s_type,
+ "thread_m_blocks": max(m_blocks, 1),
+ "thread_k_blocks": thread_k // 16,
+ "thread_n_blocks": thread_n // 16,
+ "m_block_size_8": "true" if m_blocks == 0.5 else "false",
+ "stages": "pipe_stages",
+ "group_blocks": group_blocks,
+ "is_zp_float": "false",
+ }
+
+ result_dict[(a_type, b_type, c_type)].append(config)
+
+ kernel_selector_str = FILE_HEAD_COMMENT
+
+ for (a_type, b_type, c_type), config_list in result_dict.items():
all_template_str_list = []
-
- for group_blocks, m_blocks, thread_configs in itertools.product(
- GROUP_BLOCKS, THREAD_M_BLOCKS, THREAD_CONFIGS
- ):
- # act order case only support gptq-int4 and gptq-int8
- if group_blocks == 0 and scalar_type not in [
- "vllm::kU4B8",
- "vllm::kU8B128",
- ]:
- continue
- if thread_configs[2] == 256:
- # for small batch (m_blocks == 1), we only need (128, 128, 256)
- # for large batch (m_blocks > 1), we only need (64, 256, 256)
- if m_blocks <= 1 and thread_configs[0] != 128:
- continue
- if m_blocks > 1 and thread_configs[0] != 64:
- continue
-
- # we only support channelwise quantization and group_size == 128
- # for fp8
- if scalar_type == "vllm::kFE4M3fn" and group_blocks not in [-1, 8]:
- continue
- # nvfp4 only supports group_size == 16
- # mxfp4 only supports group_size == 32
- if scalar_type == "vllm::kFE2M1f" and group_blocks not in [1, 2]:
- continue
- # other quantization methods don't support group_size = 16
- if scalar_type != "vllm::kFE2M1f" and group_blocks == 1:
- continue
-
- k_blocks = thread_configs[0] // 16
- n_blocks = thread_configs[1] // 16
- threads = thread_configs[2]
-
- c_dtype = "half" if dtype == "fp16" else "nv_bfloat16"
-
- if scalar_type == "vllm::kFE2M1f" and group_blocks == 1:
- s_type = "vllm::kFE4M3fn"
- elif scalar_type == "vllm::kFE2M1f" and group_blocks == 2:
- s_type = "vllm::kFE8M0fnu"
- if dtype == "fp16":
- # we cannot safely dequantize e8m0 to fp16, so skip this
- continue
- elif dtype == "fp16":
- s_type = "vllm::kFloat16"
- elif dtype == "bf16":
- s_type = "vllm::kBFloat16"
-
+ for config in config_list:
+ s_type = config["s_type"]
template_str = jinja2.Template(TEMPLATE).render(
- scalar_t=c_dtype,
- w_type_id=scalar_type + ".id()",
- s_type_id=s_type + ".id()",
- threads=threads,
- thread_m_blocks=max(m_blocks, 1),
- thread_n_blocks=n_blocks,
- thread_k_blocks=k_blocks,
- m_block_size_8=m_blocks == 0.5,
- stages="pipe_stages",
- group_blocks=group_blocks,
- is_zp_float=False,
+ a_type_id=f"vllm::{a_type}.id()",
+ b_type_id=f"vllm::{b_type}.id()",
+ c_type_id=f"vllm::{c_type}.id()",
+ s_type_id=f"vllm::{s_type}.id()",
+ **config,
+ )
+ all_template_str_list.append(template_str)
+
+ conditions = [
+ f"a_type == vllm::{a_type}",
+ f"b_type == vllm::{b_type}",
+ f"c_type == vllm::{c_type}",
+ f"s_type == vllm::{s_type}",
+ f"threads == {config['threads']}",
+ f"thread_m_blocks == {config['thread_m_blocks']}",
+ f"thread_n_blocks == {config['thread_n_blocks']}",
+ f"thread_k_blocks == {config['thread_k_blocks']}",
+ f"m_block_size_8 == {config['m_block_size_8']}",
+ f"group_blocks == {config['group_blocks']}",
+ f"is_zp_float == {config['is_zp_float']}",
+ ]
+ conditions = " && ".join(conditions)
+
+ if kernel_selector_str == FILE_HEAD_COMMENT:
+ kernel_selector_str += f"if ({conditions})\n kernel = "
+ else:
+ kernel_selector_str += f"else if ({conditions})\n kernel = "
+
+ kernel_template2 = (
+ "Marlin<{{a_type_id}}, {{b_type_id}}, {{c_type_id}}, "
+ "{{s_type_id}}, {{threads}}, {{thread_m_blocks}}, "
+ "{{thread_n_blocks}}, {{thread_k_blocks}}, "
+ "{{m_block_size_8}}, {{stages}}, {{group_blocks}}, "
+ "{{is_zp_float}}>;"
)
- all_template_str_list.append(template_str)
+ kernel_selector_str += (
+ jinja2.Template(kernel_template2).render(
+ a_type_id=f"vllm::{a_type}.id()",
+ b_type_id=f"vllm::{b_type}.id()",
+ c_type_id=f"vllm::{c_type}.id()",
+ s_type_id=f"vllm::{s_type}.id()",
+ **config,
+ )
+ + "\n"
+ )
file_content = FILE_HEAD + "\n\n"
file_content += "\n\n".join(all_template_str_list) + "\n\n}\n"
- filename = f"kernel_{dtype}_{scalar_type[6:].lower()}.cu"
+ if a_type == "kFE4M3fn":
+ filename = f"sm89_kernel_{a_type[1:]}_{b_type[1:]}_{c_type[1:]}.cu"
+ else:
+ filename = f"sm80_kernel_{a_type[1:]}_{b_type[1:]}_{c_type[1:]}.cu"
+
+ filename = filename.lower()
with open(os.path.join(os.path.dirname(__file__), filename), "w") as f:
f.write(file_content)
+ if not SUPPORT_FP8 and kernel_selector_str != FILE_HEAD_COMMENT:
+ kernel_selector_str += (
+ "else if (a_type == vllm::kFE4M3fn)\n"
+ " TORCH_CHECK(false, "
+ '"marlin kernel with fp8 activation is not built.");'
+ )
+
+ with open(os.path.join(os.path.dirname(__file__), "kernel_selector.h"), "w") as f:
+ f.write(kernel_selector_str)
+
if __name__ == "__main__":
remove_old_kernels()
diff --git a/csrc/moe/marlin_moe_wna16/kernel.h b/csrc/moe/marlin_moe_wna16/kernel.h
index 6190f7ee21ece..57f5a17932d44 100644
--- a/csrc/moe/marlin_moe_wna16/kernel.h
+++ b/csrc/moe/marlin_moe_wna16/kernel.h
@@ -11,8 +11,9 @@
const int4 *__restrict__ A, const int4 *__restrict__ B, \
int4 *__restrict__ C, int4 *__restrict__ C_tmp, \
const int4 *__restrict__ b_bias_ptr, \
+ const float *__restrict__ a_scales_ptr, \
const int4 *__restrict__ scales_ptr, \
- const uint16_t *__restrict__ scale2_ptr, \
+ const uint16_t *__restrict__ global_scale_ptr, \
const int4 *__restrict__ zp_ptr, const int *__restrict__ g_idx, \
const int32_t *__restrict__ sorted_token_ids_ptr, \
const int32_t *__restrict__ expert_ids_ptr, \
@@ -20,12 +21,13 @@
const float *__restrict__ topk_weights_ptr, int top_k, \
bool mul_topk_weights, bool is_ep, int num_groups, int prob_m, \
int prob_n, int prob_k, int *locks, bool has_bias, bool use_atomic_add, \
- bool use_fp32_reduce, int max_shared_mem
+ bool use_fp32_reduce
namespace MARLIN_NAMESPACE_NAME {
-template shared
// fetch pipeline
- const int group_blocks, // number of consecutive 16x16 blocks
- // with a separate quantization scale
- const bool is_zp_float // is zero point of float16 type?
+ const bool has_act_order, // whether act_order is enabled
+ const int group_blocks, // number of consecutive 16x16 blocks
+ // with a separate quantization scale
+ const bool is_zp_float // is zero point of float16 type?
>
__global__ void Marlin(
const int4* __restrict__ A, // fp16 input matrix of shape mxk
@@ -76,8 +77,8 @@ __global__ void Marlin(
int prob_k, // reduction dimension k
int* locks, // extra global storage for barrier synchronization
bool use_atomic_add, // whether to use atomic add to reduce
- bool use_fp32_reduce, // whether to use fp32 global reduce
- int max_shared_mem) {}
+ bool use_fp32_reduce // whether to use fp32 global reduce
+) {}
} // namespace MARLIN_NAMESPACE_NAME
@@ -85,65 +86,148 @@ __global__ void Marlin(
// m16n8k16 tensor core mma instruction with fp16 inputs and fp32
// output/accumulation.
-template
-__device__ inline void mma(const typename ScalarType::FragA& a_frag,
- const typename ScalarType::FragB& frag_b,
- typename ScalarType::FragC& frag_c) {
+template
+__device__ inline void mma(
+ const typename MarlinScalarType::FragA& a_frag,
+ const typename MarlinScalarType::FragB& frag_b,
+ typename MarlinScalarType::FragC& frag_c, int idx = 0) {
const uint32_t* a = reinterpret_cast(&a_frag);
const uint32_t* b = reinterpret_cast(&frag_b);
- float* c = reinterpret_cast(&frag_c);
- if constexpr (std::is_same::value) {
- asm volatile(
- "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 "
- "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
- : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
- : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
- "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
- } else if constexpr (std::is_same::value) {
- asm volatile(
- "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 "
- "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
- : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
- : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
- "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
- } else {
- STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t);
+ using scalar_t = typename MarlinScalarType::scalar_t;
+ if constexpr (k_size == 16) {
+ if constexpr (std::is_same::value) {
+ float* c = reinterpret_cast(&frag_c);
+ asm volatile(
+ "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 "
+ "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
+ : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
+ : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
+ "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
+ } else if constexpr (std::is_same::value) {
+ float* c = reinterpret_cast(&frag_c);
+ asm volatile(
+ "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 "
+ "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
+ : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
+ : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
+ "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
+ } else if constexpr (std::is_same::value) {
+ float* c = reinterpret_cast(&frag_c);
+ asm volatile(
+ "mma.sync.aligned.m16n8k16.row.col.f32.e4m3.e4m3.f32 "
+ "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
+ : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
+ : "r"(a[idx * 2]), "r"(a[idx * 2 + 1]), "r"(b[idx]), "f"(c[0]),
+ "f"(c[1]), "f"(c[2]), "f"(c[3]));
+ } else if constexpr (std::is_same::value) {
+ int32_t* c = reinterpret_cast(&frag_c);
+ asm volatile(
+ "mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32.satfinite "
+ "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
+ : "=r"(c[0]), "=r"(c[1]), "=r"(c[2]), "=r"(c[3])
+ : "r"(a[idx * 2]), "r"(a[idx * 2 + 1]), "r"(b[idx]), "r"(c[0]),
+ "r"(c[1]), "r"(c[2]), "r"(c[3]));
+ }
+ } else if (k_size == 32) {
+ if constexpr (std::is_same::value) {
+ float* c = reinterpret_cast(&frag_c);
+ asm volatile(
+ "mma.sync.aligned.m16n8k32.row.col.f32.e4m3.e4m3.f32 "
+ "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
+ : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
+ : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
+ "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
+ } else if constexpr (std::is_same::value) {
+ int32_t* c = reinterpret_cast(&frag_c);
+ asm volatile(
+ "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32.satfinite "
+ "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
+ : "=r"(c[0]), "=r"(c[1]), "=r"(c[2]), "=r"(c[3])
+ : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
+ "r"(c[0]), "r"(c[1]), "r"(c[2]), "r"(c[3]));
+ }
}
}
-template
+template
__device__ inline void mma_trans(
- const typename ScalarType::FragA& a_frag,
- const typename ScalarType::FragB& frag_b,
- const typename ScalarType::FragB& frag_b2,
- typename ScalarType::FragC& frag_c) {
+ const typename MarlinScalarType::FragA& a_frag,
+ const typename MarlinScalarType::FragB& frag_b,
+ const typename MarlinScalarType::FragB& frag_b2,
+ typename MarlinScalarType::FragC& frag_c) {
const uint32_t* a = reinterpret_cast(&a_frag);
const uint32_t* b = reinterpret_cast(&frag_b);
const uint32_t* b2 = reinterpret_cast(&frag_b2);
float* c = reinterpret_cast(&frag_c);
- if constexpr (std::is_same::value) {
- asm volatile(
- "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 "
- "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
- : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
- : "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]),
- "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
- } else if constexpr (std::is_same::value) {
- asm volatile(
- "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 "
- "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
- : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
- : "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]),
- "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
+ using scalar_t = typename MarlinScalarType::scalar_t;
+ if constexpr (k_size == 16) {
+ if constexpr (std::is_same::value) {
+ float* c = reinterpret_cast(&frag_c);
+ asm volatile(
+ "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 "
+ "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
+ : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
+ : "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]),
+ "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
+ } else if constexpr (std::is_same::value) {
+ float* c = reinterpret_cast(&frag_c);
+ asm volatile(
+ "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 "
+ "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
+ : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
+ : "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]),
+ "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
+ } else if constexpr (std::is_same::value) {
+ float* c = reinterpret_cast(&frag_c);
+ asm volatile(
+ "mma.sync.aligned.m16n8k16.row.col.f32.e4m3.e4m3.f32 "
+ "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
+ : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
+ : "r"(b[0]), "r"(b2[0]), "r"(a[0]), "f"(c[0]), "f"(c[1]), "f"(c[2]),
+ "f"(c[3]));
+ } else if constexpr (std::is_same::value) {
+ int32_t* c = reinterpret_cast(&frag_c);
+ asm volatile(
+ "mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32.satfinite "
+ "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
+ : "=r"(c[0]), "=r"(c[1]), "=r"(c[2]), "=r"(c[3])
+ : "r"(b[0]), "r"(b2[0]), "r"(a[0]), "r"(c[0]), "r"(c[1]), "r"(c[2]),
+ "r"(c[3]));
+ }
} else {
- STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t);
+ if constexpr (std::is_same::value) {
+ float* c = reinterpret_cast(&frag_c);
+ #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 1200
+ asm volatile(
+ "mma.sync.aligned.kind::f8f6f4.m16n8k32.row.col.f32.e4m3.e4m3.f32 "
+ "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
+ : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
+ : "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]),
+ "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
+ #else
+ asm volatile(
+ "mma.sync.aligned.m16n8k32.row.col.f32.e4m3.e4m3.f32 "
+ "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
+ : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
+ : "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]),
+ "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
+ #endif
+ } else if constexpr (std::is_same::value) {
+ int32_t* c = reinterpret_cast(&frag_c);
+ asm volatile(
+ "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32.satfinite "
+ "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
+ : "=r"(c[0]), "=r"(c[1]), "=r"(c[2]), "=r"(c[3])
+ : "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]),
+ "r"(c[0]), "r"(c[1]), "r"(c[2]), "r"(c[3]));
+ }
}
}
// Instruction for loading a full 16x16 matrix fragment of operand A from shared
// memory, directly in tensor core layout.
-template
-__device__ inline void ldsm(typename ScalarType::FragA& frag_a,
+template
+__device__ inline void ldsm(typename MarlinScalarType::FragA& frag_a,
const void* smem_ptr) {
uint32_t* a = reinterpret_cast(&frag_a);
uint32_t smem = static_cast(__cvta_generic_to_shared(smem_ptr));
@@ -167,47 +251,54 @@ __device__ inline void ldsm(typename ScalarType::FragA& frag_a,
// Multiply dequantized values by the corresponding quantization scale; used
// only for grouped quantization.
-template
-__device__ inline void scale(typename ScalarType::FragB& frag_b,
- typename ScalarType::FragS& frag_s,
+template
+__device__ inline void scale(typename MarlinScalarType::FragB& frag_b,
+ typename MarlinScalarType::FragS& frag_s,
int i) {
- using scalar_t2 = typename ScalarType::scalar_t2;
- scalar_t2 s =
- ScalarType::num2num2(reinterpret_cast(&frag_s)[i]);
+ using scalar_t = typename MarlinScalarType::scalar_t;
+ using scalar_t2 = typename MarlinScalarType::scalar_t2;
+ scalar_t2 s = MarlinScalarType::num2num2(
+ reinterpret_cast(&frag_s)[i]);
frag_b[0] = __hmul2(frag_b[0], s);
frag_b[1] = __hmul2(frag_b[1], s);
}
-template
+template
__device__ inline void scale_and_sub(
- typename ScalarType::FragB& frag_b, scalar_t s, scalar_t zp) {
- using scalar_t2 = typename ScalarType::scalar_t2;
- scalar_t2 s2 = ScalarType::num2num2(s);
- scalar_t2 zp2 = ScalarType::num2num2(zp);
+ typename MarlinScalarType::FragB& frag_b,
+ typename MarlinScalarType::scalar_t s,
+ typename MarlinScalarType::scalar_t zp) {
+ using scalar_t = typename MarlinScalarType::scalar_t;
+ using scalar_t2 = typename MarlinScalarType::scalar_t2;
+ scalar_t2 s2 = MarlinScalarType::num2num2(s);
+ scalar_t2 zp2 = MarlinScalarType::num2num2(zp);
frag_b[0] = __hfma2(frag_b[0], s2, __hneg2(zp2));
frag_b[1] = __hfma2(frag_b[1], s2, __hneg2(zp2));
}
-template
-__device__ inline void sub_zp(typename ScalarType::FragB& frag_b,
- typename ScalarType::scalar_t2& frag_zp,
- int i) {
- using scalar_t2 = typename ScalarType::scalar_t2;
- scalar_t2 zp =
- ScalarType::num2num2(reinterpret_cast(&frag_zp)[i]);
+template
+__device__ inline void sub_zp(
+ typename MarlinScalarType::FragB& frag_b,
+ typename MarlinScalarType::scalar_t2& frag_zp, int i) {
+ using scalar_t = typename MarlinScalarType::scalar_t;
+ using scalar_t2 = typename MarlinScalarType::scalar_t2;
+ scalar_t2 zp = MarlinScalarType::num2num2(
+ reinterpret_cast(&frag_zp)[i]);
frag_b[0] = __hsub2(frag_b[0], zp);
frag_b[1] = __hsub2(frag_b[1], zp);
}
// Same as above, but for act_order (each K is multiplied individually)
-template
-__device__ inline void scale4(typename ScalarType::FragB& frag_b,
- typename ScalarType::FragS& frag_s_1,
- typename ScalarType::FragS& frag_s_2,
- typename ScalarType::FragS& frag_s_3,
- typename ScalarType::FragS& frag_s_4,
- int i) {
- using scalar_t2 = typename ScalarType::scalar_t2;
+template
+__device__ inline void scale4(
+ typename MarlinScalarType::FragB& frag_b,
+ typename MarlinScalarType::FragS& frag_s_1,
+ typename MarlinScalarType::FragS& frag_s_2,
+ typename MarlinScalarType::FragS& frag_s_3,
+ typename MarlinScalarType::FragS& frag_s_4, int i) {
+ using scalar_t = typename MarlinScalarType::scalar_t;
+ using scalar_t2 = typename MarlinScalarType::scalar_t2;
+
scalar_t2 s_val_1_2;
s_val_1_2.x = reinterpret_cast(&frag_s_1)[i];
s_val_1_2.y = reinterpret_cast(&frag_s_2)[i];
@@ -221,12 +312,13 @@ __device__ inline void scale4(typename ScalarType::FragB& frag_b,
}
// Given 2 floats multiply by 2 scales (halves)
-template
-__device__ inline void scale_float(float* c,
- typename ScalarType::FragS& s) {
+template
+__device__ inline void scale_float(
+ float* c, typename MarlinScalarType::FragS& s) {
+ using scalar_t = typename MarlinScalarType::scalar_t;
scalar_t* s_ptr = reinterpret_cast(&s);
- c[0] = __fmul_rn(c[0], ScalarType::num2float(s_ptr[0]));
- c[1] = __fmul_rn(c[1], ScalarType::num2float(s_ptr[1]));
+ c[0] = __fmul_rn(c[0], MarlinScalarType::num2float(s_ptr[0]));
+ c[1] = __fmul_rn(c[1], MarlinScalarType::num2float(s_ptr[1]));
}
// Wait until barrier reaches `count`, then lock for current threadblock.
@@ -278,9 +370,10 @@ __device__ inline void wait_negative_and_add(int* lock) {
__syncthreads();
}
-template ;
- using scalar_t2 = typename ScalarType::scalar_t2;
- using FragA = typename ScalarType::FragA;
- using FragB = typename ScalarType::FragB;
- using FragC = typename ScalarType::FragC;
- using FragS = typename ScalarType::FragS;
- using FragZP = typename ScalarType::FragZP;
+
+ #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 890
+ // FP8 computation is only supported for Ada Lovelace or newer architectures.
+ if constexpr (a_type_id == vllm::kFE4M3fn.id()) return;
+ #endif
+
+ int num_tokens_past_padded = num_tokens_past_padded_ptr[0];
+ constexpr int moe_block_size = m_block_size_8 ? 8 : (16 * thread_m_blocks);
+
+ using Adtype = MarlinScalarType;
+ using Cdtype = MarlinScalarType;
+
+ using scalar_t = typename MarlinScalarType::scalar_t;
+ using scalar_t2 = typename MarlinScalarType::scalar_t2;
+ using scalar_32bit_t = typename MarlinScalarType::scalar_32bit_t;
+
+ using c_scalar_t = typename MarlinScalarType::scalar_t;
+ using c_scalar_t2 = typename MarlinScalarType::scalar_t2;
+
+ using FragA = typename MarlinScalarType::FragA;
+ using FragB = typename MarlinScalarType::FragB;
+ using FragC = typename MarlinScalarType::FragC;
+ using FragS = typename MarlinScalarType::FragS;
+ using FragZP = typename MarlinScalarType::FragZP;
extern __shared__ int4 sh[];
- static constexpr auto w_type = vllm::ScalarType::from_id(w_type_id);
+ static constexpr auto a_type = vllm::ScalarType::from_id(a_type_id);
+ static constexpr auto b_type = vllm::ScalarType::from_id(b_type_id);
+ static constexpr auto c_type = vllm::ScalarType::from_id(c_type_id);
static constexpr auto s_type = vllm::ScalarType::from_id(s_type_id);
- if constexpr (w_type == vllm::kFE2M1f) {
+ if constexpr (b_type == vllm::kFE2M1f) {
static_assert(s_type == vllm::kFE4M3fn && group_blocks == 1 ||
s_type == vllm::kFE8M0fnu && group_blocks == 2);
} else if constexpr (std::is_same::value) {
@@ -355,34 +472,37 @@ __global__ void Marlin(
static_assert(s_type == vllm::kFloat16);
}
- constexpr bool has_zp = w_type == vllm::kU4 || w_type == vllm::kU8;
- constexpr bool is_int_type = w_type == vllm::kU4 || w_type == vllm::kU8 ||
- w_type == vllm::kU4B8 || w_type == vllm::kU8B128;
+ constexpr bool is_a_8bit = a_type.size_bits() == 8;
+ if constexpr (!is_a_8bit) {
+ static_assert(std::is_same::value);
+ }
+ constexpr bool has_zp = b_type == vllm::kU4 || b_type == vllm::kU8;
+ constexpr bool is_int_type = b_type == vllm::kU4 || b_type == vllm::kU8 ||
+ b_type == vllm::kS4 || b_type == vllm::kS8 ||
+ b_type == vllm::kU4B8 || b_type == vllm::kU8B128;
// see comments of dequant.h for more details
constexpr bool dequant_skip_flop =
- w_type == vllm::kFE4M3fn ||
- w_type == vllm::kFE2M1f && s_type == vllm::kFE4M3fn ||
+ is_a_8bit || b_type == vllm::kFE4M3fn ||
+ b_type == vllm::kFE2M1f && s_type == vllm::kFE4M3fn ||
has_zp && !is_zp_float && !std::is_same::value ||
- has_zp && !is_zp_float && !(w_type == vllm::kU8);
+ has_zp && !is_zp_float && !(b_type == vllm::kU8);
- scalar_t2 global_scale;
+ c_scalar_t2 global_scale;
constexpr bool has_act_order = group_blocks == 0;
- constexpr int pack_factor = 32 / w_type.size_bits();
+ constexpr int pack_factor = 32 / b_type.size_bits();
static_assert(thread_m_blocks == 1 || !m_block_size_8);
- constexpr int moe_block_size = m_block_size_8 ? 8 : (16 * thread_m_blocks);
const int group_size =
(!has_act_order && group_blocks == -1) ? prob_k : prob_k / num_groups;
const int scales_expert_stride =
- prob_n * prob_k / group_size / (w_type == vllm::kFE2M1f ? 16 : 8);
+ prob_n * prob_k / group_size / (b_type == vllm::kFE2M1f ? 16 : 8);
const int zp_expert_stride =
is_zp_float ? prob_n * prob_k / group_size / 8
: prob_n * prob_k / group_size / (pack_factor * 4);
const int b_bias_expert_stride = prob_n / 8;
// parallel: num valid moe blocks
- int num_tokens_past_padded = num_tokens_past_padded_ptr[0];
int parallel = num_tokens_past_padded / moe_block_size;
int num_valid_blocks = parallel;
if (is_ep) {
@@ -395,7 +515,23 @@ __global__ void Marlin(
int k_tiles = prob_k / 16 / thread_k_blocks;
int n_tiles = prob_n / 16 / thread_n_blocks;
- int iters = div_ceil(k_tiles * n_tiles * parallel, gridDim.x);
+
+ int global_mn_tiles = parallel * n_tiles;
+ int part2_mn_tiles = global_mn_tiles;
+ int part1_mn_iters = 0;
+ bool in_part2 = false;
+
+ // we use DP + two-tile SK here
+ // part1: DP
+ // part2: two-tile SK
+ // see https://github.com/vllm-project/vllm/pull/24722 for more details
+ if (global_mn_tiles > gridDim.x) {
+ part2_mn_tiles = global_mn_tiles % gridDim.x;
+ if (part2_mn_tiles * 3 <= gridDim.x) part2_mn_tiles += gridDim.x;
+ part1_mn_iters = (global_mn_tiles - part2_mn_tiles) / gridDim.x;
+ }
+
+ int iters = div_ceil(k_tiles * part2_mn_tiles, gridDim.x);
if constexpr (!has_act_order && group_blocks != -1) {
if (group_blocks >= thread_k_blocks) {
@@ -407,14 +543,15 @@ __global__ void Marlin(
}
}
- int slice_row = (iters * blockIdx.x) % k_tiles;
- int slice_col_par = (iters * blockIdx.x) / k_tiles;
- int slice_col = slice_col_par;
- int slice_iters; // number of threadblock tiles in the current slice
- int slice_count =
- 0; // total number of active threadblocks in the current slice
- int slice_idx; // index of threadblock in current slice; numbered bottom to
- // top
+ int slice_row = 0;
+ int slice_col_par = blockIdx.x;
+ int slice_col;
+ int slice_iters =
+ k_tiles; // number of threadblock tiles in the current slice
+ // total number of active threadblocks in the current slice
+ int slice_count = 1;
+ // index of threadblock in current slice; numbered bottom to top
+ int slice_idx = 0;
int par_id = 0;
int block_id = -1;
@@ -422,85 +559,89 @@ __global__ void Marlin(
int old_expert_id = 0;
int64_t B_expert_off = 0;
- int4* sh_block_sorted_ids_int4 = sh;
+ float* sh_a_s = reinterpret_cast(sh);
+ int4* sh_block_sorted_ids_int4 = sh + (is_a_8bit ? (4 * thread_m_blocks) : 0);
int4* sh_rd_block_sorted_ids_int4 =
sh_block_sorted_ids_int4 + moe_block_size / 4;
int4* sh_block_topk_weights_int4 =
sh_rd_block_sorted_ids_int4 + moe_block_size / 4;
// sh_block_topk_weights_int4 only need (moe_block_size / 4);
// but we pad to align to 256 bytes
- int4* sh_new =
- sh_block_topk_weights_int4 + moe_block_size / 2 + moe_block_size;
+ int4* sh_new = sh_block_topk_weights_int4 + moe_block_size / 2;
int32_t* sh_block_sorted_ids =
reinterpret_cast(sh_block_sorted_ids_int4);
int32_t* sh_rd_block_sorted_ids =
reinterpret_cast(sh_rd_block_sorted_ids_int4);
- scalar_t2* sh_block_topk_weights =
- reinterpret_cast(sh_block_topk_weights_int4);
+ c_scalar_t2* sh_block_topk_weights =
+ reinterpret_cast(sh_block_topk_weights_int4);
int32_t block_num_valid_tokens = 0;
int32_t locks_off = 0;
// We can easily implement parallel problem execution by just remapping
// indices and advancing global pointers
- if (slice_col_par >= n_tiles) {
- slice_col = slice_col_par % n_tiles;
- par_id = slice_col_par / n_tiles;
- }
- if (parallel * n_tiles >= gridDim.x) {
- // when parallel * n_tiles >= sms
+ if (part2_mn_tiles >= gridDim.x) {
+ // when part2_mn_tiles >= sms
// then there are at most $sms$ conflict tile blocks
locks_off = blockIdx.x;
} else {
locks_off = (iters * blockIdx.x) / k_tiles - 1;
}
+ int prob_m_top_k = prob_m * top_k;
// read moe block data given block_id
// block_sorted_ids / block_num_valid_tokens / block_topk_weights
auto read_moe_block_data = [&](int block_id) {
block_num_valid_tokens = moe_block_size;
+
+ cp_async4_pred(sh_block_sorted_ids_int4 + threadIdx.x,
+ reinterpret_cast(sorted_token_ids_ptr) +
+ (block_id * moe_block_size / 4 + threadIdx.x),
+ threadIdx.x < moe_block_size / 4);
+
+ cp_async_fence();
+ cp_async_wait<0>();
+
+ __syncthreads();
+
+ if (threadIdx.x >= threads - 32) {
+ constexpr int size_per_thread = div_ceil(moe_block_size, 32);
+ int lane_id = threadIdx.x - (threads - 32);
+
+ int local_count = 0;
#pragma unroll
- for (int i = 0; i < moe_block_size / 4; i++) {
- int4 sorted_token_ids_int4 = reinterpret_cast(
- sorted_token_ids_ptr)[block_id * moe_block_size / 4 + i];
- int* sorted_token_ids = reinterpret_cast(&sorted_token_ids_int4);
- #pragma unroll
- for (int j = 0; j < 4; j++) {
- if (sorted_token_ids[j] >= prob_m * top_k) {
- block_num_valid_tokens = i * 4 + j;
- break;
+ for (int i = 0; i < size_per_thread; i++) {
+ int j = lane_id * size_per_thread + i;
+ if (j < moe_block_size) {
+ int idx = sh_block_sorted_ids[j];
+ if (idx < prob_m_top_k) local_count++;
}
}
- if (block_num_valid_tokens != moe_block_size) break;
+
+ block_num_valid_tokens = __reduce_add_sync(0xffffffff, local_count);
+
+ if (lane_id == 0)
+ reinterpret_cast(sh_new)[0] = block_num_valid_tokens;
+ }
+
+ if (threadIdx.x < moe_block_size) {
+ int idx = sh_block_sorted_ids[threadIdx.x];
+ sh_rd_block_sorted_ids[threadIdx.x] = idx / top_k;
+
+ if (mul_topk_weights) {
+ idx = idx < prob_m_top_k ? idx : 0;
+ c_scalar_t2 topk_weight_val =
+ Cdtype::num2num2(Cdtype::float2num(topk_weights_ptr[idx]));
+ if constexpr (b_type == vllm::kFE2M1f && s_type == vllm::kFE4M3fn) {
+ topk_weight_val = __hmul2(topk_weight_val, global_scale);
+ }
+ sh_block_topk_weights[threadIdx.x] = topk_weight_val;
+ }
}
__syncthreads();
- int tid4 = threadIdx.x / 4;
- if (threadIdx.x % 4 == 0 && threadIdx.x < block_num_valid_tokens) {
- sh_block_sorted_ids_int4[tid4] = reinterpret_cast(
- sorted_token_ids_ptr)[block_id * moe_block_size / 4 + tid4];
- #pragma unroll
- for (int i = 0; i < 4; i++)
- sh_rd_block_sorted_ids[tid4 * 4 + i] =
- sh_block_sorted_ids[tid4 * 4 + i] / top_k;
-
- if (mul_topk_weights) {
- #pragma unroll
- for (int i = 0; i < 4; i++) {
- int idx = tid4 * 4 + i;
- idx = idx < block_num_valid_tokens ? idx : 0;
- if constexpr (w_type == vllm::kFE2M1f && s_type == vllm::kFE4M3fn) {
- sh_block_topk_weights[idx] = __hmul2(
- global_scale, Dtype::num2num2(Dtype::float2num(
- topk_weights_ptr[sh_block_sorted_ids[idx]])));
- } else {
- sh_block_topk_weights[idx] = Dtype::num2num2(
- Dtype::float2num(topk_weights_ptr[sh_block_sorted_ids[idx]]));
- }
- }
- }
- }
+ block_num_valid_tokens = reinterpret_cast(sh_new)[0];
__syncthreads();
};
@@ -511,9 +652,8 @@ __global__ void Marlin(
old_expert_id = expert_id;
if (num_invalid_blocks > 0) {
- int skip_count = block_id == -1 ? par_id : 0;
- block_id++;
- for (int i = block_id; i < num_tokens_past_padded / moe_block_size; i++) {
+ int skip_count = par_id;
+ for (int i = 0; i < num_tokens_past_padded / moe_block_size; i++) {
expert_id = expert_ids_ptr[i];
if (expert_id != -1) {
if (skip_count == 0) {
@@ -528,9 +668,9 @@ __global__ void Marlin(
expert_id = expert_ids_ptr[block_id];
}
- if constexpr (w_type == vllm::kFE2M1f && s_type == vllm::kFE4M3fn) {
- uint16_t val = scale2_ptr[expert_id];
- global_scale = Dtype::num2num2(*reinterpret_cast(&val));
+ if constexpr (b_type == vllm::kFE2M1f && s_type == vllm::kFE4M3fn) {
+ uint16_t val = global_scale_ptr[expert_id];
+ global_scale = Cdtype::num2num2(*reinterpret_cast(&val));
}
B_expert_off = expert_id * prob_n * prob_k / (pack_factor * 4);
@@ -550,10 +690,11 @@ __global__ void Marlin(
// Compute all information about the current slice which is required for
// synchronization.
- auto init_slice = [&](bool first_init = false) {
+ bool first_init = true;
+ auto init_part2_slice = [&]() {
slice_iters =
iters * (blockIdx.x + 1) - (k_tiles * slice_col_par + slice_row);
- if (slice_iters < 0 || slice_col_par >= n_tiles * parallel) slice_iters = 0;
+ if (slice_iters < 0 || slice_col_par >= part2_mn_tiles) slice_iters = 0;
if (slice_iters == 0) return;
if (slice_row + slice_iters > k_tiles) slice_iters = k_tiles - slice_row;
slice_count = 1;
@@ -571,7 +712,7 @@ __global__ void Marlin(
if (col_off > 0) slice_idx--;
}
}
- if (parallel * n_tiles >= gridDim.x) {
+ if (part2_mn_tiles >= gridDim.x) {
if (slice_count > 1 && slice_idx == slice_count - 1) {
locks_off++;
}
@@ -605,25 +746,61 @@ __global__ void Marlin(
par_id++;
update_next_moe_block_data();
}
+ if (is_a_8bit && (first_init || slice_col == 0)) {
+ __syncthreads();
+ cp_async1_ca_pred(&sh_a_s[threadIdx.x],
+ &a_scales_ptr[sh_rd_block_sorted_ids[threadIdx.x]],
+ threadIdx.x < block_num_valid_tokens);
+ }
};
- update_next_moe_block_data();
- init_slice(true);
+ auto init_part1_slice = [&]() {
+ if (part1_mn_iters) {
+ part1_mn_iters--;
+ par_id = slice_col_par / n_tiles;
+ slice_col = slice_col_par % n_tiles;
+ slice_iters = k_tiles;
+ update_next_moe_block_data();
+ if (is_a_8bit) {
+ __syncthreads();
+ cp_async1_ca_pred(&sh_a_s[threadIdx.x],
+ &a_scales_ptr[sh_rd_block_sorted_ids[threadIdx.x]],
+ threadIdx.x < block_num_valid_tokens);
+ }
+ }
+ };
+
+ auto init_slice = [&]() {
+ if (!in_part2 && !part1_mn_iters) {
+ in_part2 = true;
+ slice_col_par = (iters * blockIdx.x) / k_tiles;
+ slice_row = (iters * blockIdx.x) % k_tiles;
+ slice_col = (slice_col_par + global_mn_tiles - part2_mn_tiles) % n_tiles;
+ par_id = (slice_col_par + global_mn_tiles - part2_mn_tiles) / n_tiles;
+ update_next_moe_block_data();
+ }
+ if (!in_part2) {
+ init_part1_slice();
+ } else {
+ init_part2_slice();
+ first_init = false;
+ }
+ };
+
+ init_slice();
// A sizes/strides
// stride of the A matrix in global memory
- int a_gl_stride = prob_k / 8;
+ int a_gl_stride = prob_k / (is_a_8bit ? 16 : 8);
// stride of an A matrix tile in shared memory
- constexpr int a_sh_stride = 16 * thread_k_blocks / 8;
+ constexpr int a_sh_stride = 16 * thread_k_blocks / (is_a_8bit ? 16 : 8);
// delta between subsequent A tiles in global memory
- constexpr int a_gl_rd_delta_o = 16 * thread_k_blocks / 8;
+ constexpr int a_gl_rd_delta_o = 16 * thread_k_blocks / (is_a_8bit ? 16 : 8);
// between subsequent accesses within a tile
int a_gl_rd_delta_i = a_gl_stride * (threads / a_gl_rd_delta_o);
// between shared memory writes
constexpr int a_sh_wr_delta = a_sh_stride * (threads / a_gl_rd_delta_o);
- // between shared memory tile reads
- constexpr int a_sh_rd_delta_o = 2 * ((threads / 32) / (thread_n_blocks / 4));
// within a shared memory tile
constexpr int a_sh_rd_delta_i = a_sh_stride * 16;
// overall size of a tile
@@ -632,24 +809,25 @@ __global__ void Marlin(
constexpr int a_sh_wr_iters = div_ceil(a_sh_stage, a_sh_wr_delta);
// B sizes/strides
- int b_gl_stride = 16 * prob_n / (pack_factor * 4);
- constexpr int b_sh_stride = ((thread_n_blocks * 16) * 16 / pack_factor) / 4;
- constexpr int b_thread_vecs = w_type.size_bits() == 4 ? 1 : 2;
+ int b_gl_stride = 16 * prob_n / (pack_factor * (is_a_8bit ? 2 : 4));
+ constexpr int b_sh_stride =
+ ((thread_n_blocks * 16) * 16 / pack_factor) / (is_a_8bit ? 2 : 4);
+ constexpr int b_thread_vecs = b_type.size_bits() == 4 ? 1 : 2;
constexpr int b_sh_stride_threads = b_sh_stride / b_thread_vecs;
- int b_gl_rd_delta_o = b_gl_stride * thread_k_blocks;
- int b_gl_rd_delta_i = b_gl_stride * (threads / b_sh_stride_threads);
+ int b_gl_rd_delta_o = b_gl_stride * thread_k_blocks / (is_a_8bit ? 2 : 1);
constexpr int b_sh_wr_delta = threads * b_thread_vecs;
- constexpr int b_sh_rd_delta = threads * b_thread_vecs;
- constexpr int b_sh_stage = b_sh_stride * thread_k_blocks;
+ constexpr int b_sh_stage =
+ b_sh_stride * thread_k_blocks / (is_a_8bit ? 2 : 1);
constexpr int b_sh_wr_iters = b_sh_stage / b_sh_wr_delta;
// Scale sizes/strides without act_order
- int s_gl_stride = prob_n / 8;
- constexpr int s_sh_stride = 16 * thread_n_blocks / 8;
+ int s_gl_stride = prob_n / (b_type == vllm::kFE2M1f ? 16 : 8);
+ constexpr int s_sh_stride =
+ 16 * thread_n_blocks / (b_type == vllm::kFE2M1f ? 16 : 8);
constexpr int s_tb_groups =
!has_act_order && group_blocks != -1 && group_blocks < thread_k_blocks
- ? thread_k_blocks / group_blocks / (w_type == vllm::kFE2M1f ? 2 : 1)
+ ? thread_k_blocks / group_blocks
: 1;
constexpr int s_sh_stage = s_tb_groups * s_sh_stride;
int s_gl_rd_delta = s_gl_stride;
@@ -662,7 +840,8 @@ __global__ void Marlin(
constexpr int act_s_max_num_groups = 32;
int act_s_col_stride = 1;
int act_s_col_warp_stride = act_s_col_stride * 8;
- int tb_n_warps = thread_n_blocks / 4;
+
+ constexpr int tb_n_warps = thread_n_blocks / (is_a_8bit ? 2 : 4);
int act_s_col_tb_stride = act_s_col_warp_stride * tb_n_warps;
// Zero-points sizes/strides
@@ -677,7 +856,6 @@ __global__ void Marlin(
// Global A read index of current thread.
int a_gl_rd_row = threadIdx.x / a_gl_rd_delta_o;
int a_gl_rd_col = a_gl_rd_delta_o * slice_row + threadIdx.x % a_gl_rd_delta_o;
-
// Shared write index of current thread.
int a_sh_wr = a_sh_stride * (threadIdx.x / a_gl_rd_delta_o) +
(threadIdx.x % a_gl_rd_delta_o);
@@ -685,17 +863,22 @@ __global__ void Marlin(
int a_sh_rd =
a_sh_stride * ((threadIdx.x % 32) % (16 / (m_block_size_8 ? 2 : 1))) +
(threadIdx.x % 32) / (16 / (m_block_size_8 ? 2 : 1));
- a_sh_rd += 2 * ((threadIdx.x / 32) / (thread_n_blocks / 4));
+ a_sh_rd += 2 * ((threadIdx.x / 32) / tb_n_warps) * b_sh_wr_iters;
- int b_gl_rd = b_gl_stride * (threadIdx.x / b_sh_stride_threads) +
- (threadIdx.x % b_sh_stride_threads) * b_thread_vecs;
- b_gl_rd += b_sh_stride * slice_col;
+ int b_gl_rd;
+ if (threads <= b_sh_stride) {
+ b_gl_rd = threadIdx.x;
+ } else {
+ b_gl_rd =
+ b_gl_stride * (threadIdx.x / b_sh_stride) + (threadIdx.x % b_sh_stride);
+ }
+
+ b_gl_rd += B_expert_off + b_sh_stride * slice_col;
b_gl_rd += b_gl_rd_delta_o * slice_row;
- auto b_sh_wr = threadIdx.x * b_thread_vecs;
auto b_sh_rd = threadIdx.x * b_thread_vecs;
+ b_sh_rd += b_sh_rd / b_sh_stride * (b_sh_stride * (b_sh_wr_iters - 1));
// For act_order
- constexpr int k_iter_size = tb_k / b_sh_wr_iters;
int slice_k_start = tb_k * slice_row;
int slice_k_finish = slice_k_start + tb_k * slice_iters;
int slice_k_start_shared_fetch = slice_k_start;
@@ -706,58 +889,54 @@ __global__ void Marlin(
if constexpr (!has_act_order) {
if constexpr (group_blocks == -1) {
s_gl_rd = s_sh_stride * slice_col + threadIdx.x;
- } else {
- s_gl_rd = s_gl_stride * ((thread_k_blocks * slice_row) / group_blocks) /
- (w_type == vllm::kFE2M1f ? 2 : 1) +
+ } else if constexpr (group_blocks >= thread_k_blocks) {
+ s_gl_rd = s_gl_stride * ((thread_k_blocks * slice_row) / group_blocks) +
s_sh_stride * slice_col + threadIdx.x;
+ } else {
+ s_gl_rd = s_gl_stride * ((thread_k_blocks * slice_row) / group_blocks +
+ threadIdx.x / s_sh_stride) +
+ s_sh_stride * slice_col + threadIdx.x % s_sh_stride;
}
}
auto s_sh_wr = threadIdx.x;
- bool s_sh_wr_pred = threadIdx.x < s_sh_stride;
+ bool s_sh_wr_pred = threadIdx.x < s_sh_stage;
// Zero-points
int zp_gl_rd;
if constexpr (has_zp) {
if constexpr (group_blocks == -1) {
zp_gl_rd = zp_sh_stride * slice_col + threadIdx.x;
- } else {
+ } else if constexpr (group_blocks >= thread_k_blocks) {
zp_gl_rd = zp_gl_stride * ((thread_k_blocks * slice_row) / group_blocks) +
zp_sh_stride * slice_col + threadIdx.x;
+ } else {
+ zp_gl_rd = zp_gl_stride * ((thread_k_blocks * slice_row) / group_blocks +
+ threadIdx.x / zp_sh_stride) +
+ zp_sh_stride * slice_col + threadIdx.x % zp_sh_stride;
}
}
auto zp_sh_wr = threadIdx.x;
- bool zp_sh_wr_pred = threadIdx.x < zp_sh_stride;
+ bool zp_sh_wr_pred = zp_sh_stage > 0 && threadIdx.x < zp_sh_stage;
// We use a different scale layout for grouped and column-wise quantization as
// we scale a `half2` tile in column-major layout in the former and in
// row-major in the latter case.
int s_sh_rd;
- if constexpr (group_blocks != -1 && w_type == vllm::kFE2M1f) {
- auto warp_id = threadIdx.x / 32;
- int n_warps = thread_n_blocks / 4;
- int warp_row = warp_id / n_warps;
-
- s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
- (threadIdx.x % 32) / 4;
- s_sh_rd = s_sh_rd * 2 + (warp_row / group_blocks) % 2;
-
+ if constexpr (is_a_8bit) {
+ s_sh_rd = 4 * ((threadIdx.x / 32) % tb_n_warps) + (threadIdx.x % 4);
} else if constexpr (group_blocks != -1)
- s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
- (threadIdx.x % 32) / 4;
+ s_sh_rd = 8 * ((threadIdx.x / 32) % tb_n_warps) + (threadIdx.x % 32) / 4;
else if constexpr (group_blocks == -1 &&
(m_block_size_8 || (has_zp && !dequant_skip_flop)))
- s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
- (threadIdx.x % 32) / 8;
+ s_sh_rd = 8 * ((threadIdx.x / 32) % tb_n_warps) + (threadIdx.x % 32) / 8;
else
- s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
- (threadIdx.x % 32) % 4;
+ s_sh_rd = 8 * ((threadIdx.x / 32) % tb_n_warps) + (threadIdx.x % 32) % 4;
int bias_sh_rd;
if constexpr (m_block_size_8) {
- bias_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
- (threadIdx.x % 32) / 8;
+ bias_sh_rd = 8 * ((threadIdx.x / 32) % tb_n_warps) + (threadIdx.x % 32) / 8;
} else {
- bias_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
+ bias_sh_rd = (is_a_8bit ? 4 : 8) * ((threadIdx.x / 32) % tb_n_warps) +
(threadIdx.x % 32) % 4;
}
@@ -773,12 +952,16 @@ __global__ void Marlin(
if constexpr (has_zp) {
if constexpr (is_zp_float) {
if constexpr (group_blocks != -1) {
- zp_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
- (threadIdx.x % 32) / 4;
+ zp_sh_rd =
+ 8 * ((threadIdx.x / 32) % tb_n_warps) + (threadIdx.x % 32) / 4;
}
+ } else if (is_a_8bit) {
+ zp_sh_rd = num_ints_per_thread * num_col_threads *
+ ((threadIdx.x / 32) % tb_n_warps / 2) +
+ num_ints_per_thread * ((threadIdx.x % 32) / num_row_threads);
} else {
zp_sh_rd = num_ints_per_thread * num_col_threads *
- ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
+ ((threadIdx.x / 32) % tb_n_warps) +
num_ints_per_thread * ((threadIdx.x % 32) / num_row_threads);
}
}
@@ -805,18 +988,13 @@ __global__ void Marlin(
for (int i = 0; i < b_sh_wr_iters; i++) {
#pragma unroll
for (int j = 0; j < thread_m_blocks; j++)
- a_sh_rd_trans[i][j] =
- transform_a(a_sh_rd_delta_o * i + a_sh_rd_delta_i * j + a_sh_rd);
+ a_sh_rd_trans[i][j] = transform_a(2 * i + a_sh_rd_delta_i * j + a_sh_rd);
}
// Since B-accesses have non-constant stride they have to be computed at
// runtime; we break dependencies between subsequent accesses with a tile by
// maintining multiple pointers (we have enough registers), a tiny
// optimization.
- const int4* B_ptr[b_sh_wr_iters];
- #pragma unroll
- for (int i = 0; i < b_sh_wr_iters; i++)
- B_ptr[i] = B + b_gl_rd_delta_i * i + b_gl_rd;
// Shared memory storage for global fetch pipelines.
constexpr int sh_red_size = (2 * thread_n_blocks + 1) * 16 * thread_m_blocks;
@@ -845,19 +1023,12 @@ __global__ void Marlin(
static_assert(thread_m_blocks * 16 * thread_n_blocks * 16 / 8 <=
stages * b_sh_stage);
int4* sh_a = sh_s + sh_s_size;
- constexpr int shm_size_used = moe_block_size +
- stages * (g_idx_stage + zp_sh_stage) +
- sh_s_size + sh_b_red_bias_size;
-
- // all remaining shared memory is used to cache A (input)
- // sh_a_max_row is at least ` stages * 16 * thread_m_blocks `
- int sh_a_max_row =
- ((max_shared_mem - 1024) / 16 - shm_size_used) / (thread_k_blocks * 2);
// Register storage for double buffer of shared memory reads.
FragA frag_a[2][thread_m_blocks];
I4 frag_b_quant[2][b_thread_vecs];
- FragC frag_c[thread_m_blocks][4][2];
+ FragC frag_c[thread_m_blocks][is_a_8bit ? 2 : 4][2];
+ FragC frag_c_tmp[thread_m_blocks][is_a_8bit ? 2 : 4][2];
FragS frag_s[2][4]; // No act-order
FragS frag_bias[2][4];
FragS act_frag_s[2][4][4]; // For act-order
@@ -865,6 +1036,24 @@ __global__ void Marlin(
FragZP frag_zp; // Zero-points in fp16
FragZP frag_zpf[2]; // Zero-points in fp16 in HQQ
+ if constexpr (is_a_8bit && group_blocks != -1) {
+ #pragma unroll
+ for (int j = 0; j < 2; j++) {
+ #pragma unroll
+ for (int i = 0; i < thread_m_blocks; i++) {
+ #pragma unroll
+ for (int g = 0; g < 4; g++) {
+ frag_c_tmp[i][j][0][g] = 0.0f;
+ }
+
+ #pragma unroll
+ for (int g = 0; g < 4; g++) {
+ frag_c_tmp[i][j][1][g] = 0.0f;
+ }
+ }
+ }
+ }
+
// Zero accumulators.
auto zero_accums = [&]() {
#pragma unroll
@@ -908,43 +1097,36 @@ __global__ void Marlin(
}
}
};
-
// Asynchronously fetch the next A, B and s tile from global to the next
// shared memory pipeline location.
- bool should_load_a = true;
- int max_num_stage_groups =
- ((sh_a_max_row - moe_block_size) / moe_block_size + 1) / stages;
- max_num_stage_groups = max(max_num_stage_groups, 1);
- auto fetch_to_shared = [&](int pipe, int a_off, bool pred = true,
- int pipe_a = 0) {
+ auto fetch_to_shared = [&](int pipe, int a_off, bool pred = true) {
if (pred) {
- if (should_load_a) {
- int4* sh_a_stage = sh_a + moe_block_size * a_sh_stride * pipe_a;
+ int4* sh_a_stage = sh_a + moe_block_size * a_sh_stride * pipe;
#pragma unroll
- for (int i = 0; i < a_sh_wr_iters; i++) {
- int row = a_gl_rd_delta_i / a_gl_stride * i + a_gl_rd_row;
- int64_t sorted_row = 0;
- if (!m_block_size_8 || row < 8)
- sorted_row = sh_rd_block_sorted_ids[row];
- int64_t true_idx =
- sorted_row * a_gl_stride + a_gl_rd_col + a_gl_rd_delta_o * a_off;
- cp_async4_pred(&sh_a_stage[a_sh_wr_trans[i]], &A[true_idx],
- row < block_num_valid_tokens);
- }
+ for (int i = 0; i < a_sh_wr_iters; i++) {
+ int row = a_gl_rd_delta_i / a_gl_stride * i + a_gl_rd_row;
+ int64_t sorted_row = 0;
+ if (!m_block_size_8 || row < 8)
+ sorted_row = sh_rd_block_sorted_ids[row];
+ int64_t true_idx =
+ sorted_row * a_gl_stride + a_gl_rd_col + a_gl_rd_delta_o * a_off;
+ cp_async4_pred(&sh_a_stage[a_sh_wr_trans[i]], &A[true_idx],
+ row < block_num_valid_tokens);
}
int4* sh_b_stage = sh_b + b_sh_stage * pipe;
#pragma unroll
- for (int i = 0; i < b_sh_wr_iters; i++) {
- #pragma unroll
- for (int j = 0; j < b_thread_vecs; j++) {
- cp_async4(&sh_b_stage[b_sh_wr_delta * i + b_sh_wr + j],
- B_ptr[i] + j + B_expert_off);
- }
+ for (int i = 0; i < (b_sh_wr_iters * b_thread_vecs); i++) {
+ constexpr int count = div_ceil(b_sh_stride, threads);
+ int b_gl_idx =
+ b_gl_rd + (i % count) * threads +
+ b_gl_stride * (i / count) * div_ceil(threads, b_sh_stride);
- B_ptr[i] += b_gl_rd_delta_o;
+ cp_async4(&sh_b_stage[threads * i + threadIdx.x], &B[b_gl_idx]);
}
+ b_gl_rd += b_gl_rd_delta_o;
+
if constexpr (has_act_order) {
// Fetch g_idx thread-block portion
int full_pipe = a_off;
@@ -964,44 +1146,24 @@ __global__ void Marlin(
if constexpr (group_blocks != -1) {
int4* sh_s_stage = sh_s + s_sh_stage * pipe;
- if constexpr (group_blocks >= thread_k_blocks) {
- // Only fetch scales if this tile starts a new group
- if (pipe % (group_blocks / thread_k_blocks) == 0) {
- if (s_sh_wr_pred) {
- cp_async4(&sh_s_stage[s_sh_wr], &scales_ptr[s_gl_rd]);
- }
- s_gl_rd += s_gl_rd_delta;
- }
- } else {
- for (int i = 0; i < s_tb_groups; i++) {
- if (s_sh_wr_pred) {
- cp_async4(&sh_s_stage[i * s_sh_stride + s_sh_wr],
- &scales_ptr[s_gl_rd]);
- }
- s_gl_rd += s_gl_rd_delta;
+ // Only fetch scales if this tile starts a new group
+ if (pipe % div_ceil(group_blocks, thread_k_blocks) == 0) {
+ if (s_sh_wr_pred) {
+ cp_async4(&sh_s_stage[s_sh_wr], &scales_ptr[s_gl_rd]);
}
+ s_gl_rd += s_gl_rd_delta * s_tb_groups;
}
}
if constexpr (has_zp && group_blocks != -1) {
int4* sh_zp_stage = sh_zp + zp_sh_stage * pipe;
- if constexpr (group_blocks >= thread_k_blocks) {
- // Only fetch zero-points if this tile starts a new group
- if (pipe % (group_blocks / thread_k_blocks) == 0) {
- if (zp_sh_wr_pred) {
- cp_async4(&sh_zp_stage[zp_sh_wr], &zp_ptr[zp_gl_rd]);
- }
- zp_gl_rd += zp_gl_rd_delta;
- }
- } else {
- for (int i = 0; i < zp_tb_groups; i++) {
- if (zp_sh_wr_pred) {
- cp_async4(&sh_zp_stage[i * zp_sh_stride + zp_sh_wr],
- &zp_ptr[zp_gl_rd]);
- }
- zp_gl_rd += zp_gl_rd_delta;
+ // Only fetch zero points if this tile starts a new group
+ if (pipe % div_ceil(group_blocks, thread_k_blocks) == 0) {
+ if (zp_sh_wr_pred) {
+ cp_async4(&sh_zp_stage[zp_sh_wr], &zp_ptr[zp_gl_rd]);
}
+ zp_gl_rd += zp_gl_rd_delta * zp_tb_groups;
}
}
}
@@ -1035,18 +1197,18 @@ __global__ void Marlin(
// Load the next sub-tile from the current location in the shared memory pipe
// into the current register buffer.
- auto fetch_to_registers = [&](int k, int pipe, int pipe_a = 0) {
- int4* sh_a_stage = sh_a + moe_block_size * a_sh_stride * pipe_a;
+ auto fetch_to_registers = [&](int k, int pipe) {
+ int4* sh_a_stage = sh_a + moe_block_size * a_sh_stride * pipe;
#pragma unroll
for (int i = 0; i < thread_m_blocks; i++)
- ldsm(
+ ldsm(
frag_a[k % 2][i], &sh_a_stage[a_sh_rd_trans[k % b_sh_wr_iters][i]]);
int4* sh_b_stage = sh_b + b_sh_stage * pipe;
#pragma unroll
for (int i = 0; i < b_thread_vecs; i++) {
frag_b_quant[k % 2][i] = *reinterpret_cast(
- &sh_b_stage[b_sh_rd_delta * (k % b_sh_wr_iters) + b_sh_rd + i]);
+ &sh_b_stage[b_sh_stride * (k % b_sh_wr_iters) + b_sh_rd + i]);
}
};
@@ -1070,53 +1232,54 @@ __global__ void Marlin(
auto fetch_scales_to_registers = [&](int k, int full_pipe) {
int pipe = full_pipe % stages;
+ using IT1 = typename std::conditional_t;
+ using IT0 = typename std::conditional_t;
+ constexpr int group_blocks2 = div_ceil(group_blocks, is_a_8bit ? 2 : 1);
if constexpr (!has_act_order) {
// No act-order case
if constexpr (group_blocks == -1) {
// load only when starting a new slice
- if (k == 0 && full_pipe == 0) {
+ if (k == 0 && full_pipe == 0 && dequant_skip_flop) {
reinterpret_cast(&frag_s)[0] = sh_s[s_sh_rd];
reinterpret_cast(&frag_s)[1] = sh_s[s_sh_rd + 4];
}
} else if constexpr (group_blocks != -1) {
if constexpr (group_blocks >= thread_k_blocks) {
- if (k % b_sh_wr_iters == 0) {
- int4* sh_s_stage =
- sh_s + s_sh_stage * ((group_blocks / thread_k_blocks) *
- (pipe / (group_blocks / thread_k_blocks)));
- reinterpret_cast(&frag_s[k % 2])[0] = sh_s_stage[s_sh_rd];
- } else {
- reinterpret_cast(&frag_s[1])[0] =
- reinterpret_cast(&frag_s[0])[0];
+ constexpr int g = group_blocks / thread_k_blocks;
+ if (pipe % g == 0) {
+ if (k % b_sh_wr_iters == 0) {
+ int4* sh_s_stage = sh_s + s_sh_stage * (g * (pipe / g));
+ reinterpret_cast(&frag_s[k % 2])[0] = sh_s_stage[s_sh_rd];
+ } else {
+ reinterpret_cast(&frag_s[1])[0] =
+ reinterpret_cast(&frag_s[0])[0];
+ }
}
- } else {
+ } else if (group_blocks2 < b_sh_wr_iters || k % b_sh_wr_iters == 0) {
auto warp_id = threadIdx.x / 32;
- int n_warps = thread_n_blocks / 4;
+ int warp_row = warp_id / tb_n_warps;
- int warp_row = warp_id / n_warps;
-
- int cur_k = warp_row * 16;
- cur_k += k_iter_size * (k % b_sh_wr_iters);
-
- int k_blocks = cur_k / 16;
- int cur_group_id =
- k_blocks / (group_blocks * (w_type == vllm::kFE2M1f ? 2 : 1));
+ int k_blocks = b_sh_wr_iters * warp_row + k % b_sh_wr_iters;
+ int cur_group_id = k_blocks / group_blocks2;
int4* sh_s_stage = sh_s + s_sh_stage * pipe;
- if constexpr (w_type_id != vllm::kFE2M1f.id()) {
+ if constexpr (b_type_id != vllm::kFE2M1f.id()) {
reinterpret_cast(&frag_s[k % 2])[0] =
sh_s_stage[s_sh_rd + cur_group_id * s_sh_stride];
- } else if constexpr (group_blocks == 1 || thread_k_blocks > 4) {
- reinterpret_cast(&frag_s[k % 2])[0] =
- reinterpret_cast(
- sh_s_stage)[s_sh_rd + cur_group_id * (2 * s_sh_stride)];
} else {
reinterpret_cast(&frag_s[k % 2])[0] =
reinterpret_cast(
- sh_s_stage)[s_sh_rd + cur_group_id * (2 * s_sh_stride) +
- k % 2];
+ sh_s_stage)[s_sh_rd + cur_group_id * (2 * s_sh_stride)];
+ }
+ } else if (group_blocks >= b_sh_wr_iters) {
+ if constexpr (b_type_id != vllm::kFE2M1f.id()) {
+ reinterpret_cast(&frag_s[1])[0] =
+ reinterpret_cast(&frag_s[0])[0];
+ } else {
+ reinterpret_cast(&frag_s[1])[0] =
+ reinterpret_cast(&frag_s[0])[0];
}
}
}
@@ -1137,18 +1300,15 @@ __global__ void Marlin(
cur_k = 0;
// Progress to current iteration
- cur_k += k_iter_size * (k % b_sh_wr_iters);
+ cur_k += k % b_sh_wr_iters;
// Determine "position" inside the thread-block (based on warp and
// thread-id)
auto warp_id = threadIdx.x / 32;
- int n_warps =
- thread_n_blocks / 4; // Each warp processes 4 16-size tiles over N
+ int warp_row = warp_id / tb_n_warps;
+ int warp_col = warp_id % tb_n_warps;
- int warp_row = warp_id / n_warps;
- int warp_col = warp_id % n_warps;
-
- cur_k += warp_row * 16;
+ cur_k += warp_row * 16 * b_sh_wr_iters;
auto th_id = threadIdx.x % 32;
cur_k += (th_id % 4) * 2; // Due to tensor-core layout for fp16 B matrix
@@ -1203,18 +1363,16 @@ __global__ void Marlin(
if constexpr (group_blocks == -1) {
// load only when starting a new slice
- if (k == 0 && full_pipe == 0) {
+ if (k == 0 && full_pipe == 0 || is_a_8bit) {
#pragma unroll
for (int i = 0; i < num_ints_per_thread; i++) {
frag_qzp[k % 2][i] = (reinterpret_cast(sh_zp))[zp_sh_rd + i];
}
}
-
} else if constexpr (group_blocks >= thread_k_blocks) {
- if (k % b_sh_wr_iters == 0) {
- int4* sh_zp_stage =
- sh_zp + zp_sh_stage * ((group_blocks / thread_k_blocks) *
- (pipe / (group_blocks / thread_k_blocks)));
+ constexpr int g = group_blocks / thread_k_blocks;
+ if (pipe % g == 0 && k % b_sh_wr_iters == 0 || is_a_8bit) {
+ int4* sh_zp_stage = sh_zp + zp_sh_stage * (g * (pipe / g));
#pragma unroll
for (int i = 0; i < num_ints_per_thread; i++) {
frag_qzp[k % 2][i] =
@@ -1223,21 +1381,11 @@ __global__ void Marlin(
}
} else {
auto warp_id = threadIdx.x / 32;
- int n_warps = thread_n_blocks / 4;
- int warp_row = warp_id / n_warps;
+ int warp_row = warp_id / tb_n_warps;
- int cur_k = warp_row * 16;
- cur_k += k_iter_size * (k % b_sh_wr_iters);
-
- int k_blocks = cur_k / 16;
- int cur_group_id = 0;
-
- // Suppress bogus and persistent divide-by-zero warning
- #pragma nv_diagnostic push
- #pragma nv_diag_suppress divide_by_zero
- cur_group_id = k_blocks / group_blocks;
- #pragma nv_diagnostic pop
+ int k_blocks = b_sh_wr_iters * warp_row + k % b_sh_wr_iters;
+ int cur_group_id = k_blocks / div_ceil(group_blocks, is_a_8bit ? 2 : 1);
int4* sh_zp_stage = sh_zp + zp_sh_stage * pipe;
@@ -1256,29 +1404,18 @@ __global__ void Marlin(
if constexpr (group_blocks != -1) {
if constexpr (group_blocks >= thread_k_blocks) {
- if (k % b_sh_wr_iters == 0) {
- int4* sh_zp_stage =
- sh_zp +
- zp_sh_stage * ((group_blocks / thread_k_blocks) *
- (pipe / (group_blocks / thread_k_blocks)));
+ constexpr int g = group_blocks / thread_k_blocks;
+ if (pipe % g == 0 && k % b_sh_wr_iters == 0) {
+ int4* sh_zp_stage = sh_zp + zp_sh_stage * (g * (pipe / g));
reinterpret_cast(&frag_zpf[k % 2])[0] =
sh_zp_stage[zp_sh_rd];
}
- } else {
+ } else if (group_blocks < b_sh_wr_iters || k % b_sh_wr_iters == 0) {
auto warp_id = threadIdx.x / 32;
- int n_warps = thread_n_blocks / 4;
- int warp_row = warp_id / n_warps;
-
- int cur_k = warp_row * 16;
- cur_k += k_iter_size * (k % b_sh_wr_iters);
-
- int k_blocks = cur_k / 16;
- // Suppress bogus and persistent divide-by-zero warning
- #pragma nv_diagnostic push
- #pragma nv_diag_suppress divide_by_zero
+ int warp_row = warp_id / tb_n_warps;
+ int k_blocks = b_sh_wr_iters * warp_row + k % b_sh_wr_iters;
int cur_group_id = k_blocks / group_blocks;
- #pragma nv_diagnostic pop
int4* sh_zp_stage = sh_zp + zp_sh_stage * pipe;
@@ -1289,33 +1426,46 @@ __global__ void Marlin(
}
};
- auto dequant_data = [&](int q, scalar_t2* frag_b_ptr) {
- dequant(q, frag_b_ptr);
+ auto dequant_data = [&](int q, scalar_32bit_t* frag_b_ptr, int zp = 0) {
+ if constexpr (a_type.size_bits() != b_type.size_bits()) {
+ if constexpr (is_a_8bit && has_zp) {
+ sub_zp_and_dequant(
+ q, frag_b_ptr, zp);
+ } else {
+ dequant(q, frag_b_ptr);
+ }
+ }
};
// Execute the actual tensor core matmul of a sub-tile.
bool is_first_matmul_in_slice = true;
- auto matmul = [&](int k) {
+ auto matmul = [&](int k, int pipe) {
+ if (is_a_8bit) return;
int k2 = k % 2;
+ constexpr int g =
+ group_blocks > 0 ? div_ceil(group_blocks, thread_k_blocks) : 1;
const bool is_new_zp =
- ((group_blocks != -1) && (group_blocks < thread_k_blocks || k == 0)) ||
+ (group_blocks == 0) ||
+ ((group_blocks > 0) && (group_blocks < b_sh_wr_iters || k == 0)) &&
+ (pipe % g == 0) ||
(group_blocks == -1 && is_first_matmul_in_slice);
if constexpr (has_zp && !is_zp_float) {
if (is_new_zp) {
if constexpr (group_blocks == -1) is_first_matmul_in_slice = false;
int zp_quant_0, zp_quant_1;
- if constexpr (w_type.size_bits() == 4) {
+ if constexpr (b_type.size_bits() == 4) {
zp_quant_0 = frag_qzp[k2][0];
zp_quant_1 = zp_quant_0 >> 8;
} else {
- static_assert(w_type.size_bits() == 8);
+ static_assert(b_type.size_bits() == 8);
zp_quant_0 = frag_qzp[k2][0];
zp_quant_1 = frag_qzp[k2][1];
}
- dequant_data(zp_quant_0, reinterpret_cast(&frag_zp));
- dequant_data(zp_quant_1, reinterpret_cast(&frag_zp) + 2);
+ dequant_data(zp_quant_0, reinterpret_cast(&frag_zp));
+ dequant_data(zp_quant_1,
+ reinterpret_cast(&frag_zp) + 2);
}
}
if constexpr (!dequant_skip_flop && has_zp && is_zp_float) {
@@ -1325,14 +1475,14 @@ __global__ void Marlin(
}
}
- if constexpr (w_type == vllm::kFE2M1f) {
+ if constexpr (b_type == vllm::kFE2M1f) {
int s_quant_0 = reinterpret_cast(frag_s[k2])[0];
int s_quant_1 = reinterpret_cast(frag_s[k2])[1];
- dequant_fp8_scales(
- s_quant_0, reinterpret_cast(&frag_s[k2]));
- dequant_fp8_scales(
- s_quant_1, reinterpret_cast(&frag_s[k2]) + 2);
+ dequant_fp8_scales(
+ s_quant_0, reinterpret_cast(&frag_s[k2]));
+ dequant_fp8_scales(
+ s_quant_1, reinterpret_cast(&frag_s[k2]) + 2);
}
// We have the m dimension as the inner loop in order to encourage overlapping
@@ -1343,61 +1493,168 @@ __global__ void Marlin(
FragB frag_b1;
int b_quant_0, b_quant_1;
- if constexpr (w_type_id == vllm::kFE2M1f.id()) {
+ if constexpr (b_type_id == vllm::kFE2M1f.id()) {
b_quant_1 = frag_b_quant[k2][0][j];
b_quant_0 = b_quant_1 << 8;
- } else if constexpr (w_type.size_bits() == 4) {
+ } else if constexpr (b_type.size_bits() == 4) {
b_quant_0 = frag_b_quant[k2][0][j];
b_quant_1 = b_quant_0 >> 8;
} else {
- static_assert(w_type.size_bits() == 8);
+ static_assert(b_type.size_bits() == 8);
int* frag_b_quant_ptr = reinterpret_cast(frag_b_quant[k2]);
b_quant_0 = frag_b_quant_ptr[j * 2 + 0];
b_quant_1 = frag_b_quant_ptr[j * 2 + 1];
}
- dequant_data(b_quant_0, reinterpret_cast(&frag_b0));
- dequant_data(b_quant_1, reinterpret_cast(&frag_b1));
+ dequant_data(b_quant_0, reinterpret_cast(&frag_b0));
+ dequant_data(b_quant_1, reinterpret_cast(&frag_b1));
- if constexpr (dequant_skip_flop && has_zp && !is_zp_float) {
- sub_zp(frag_b0, frag_zp[j], 0);
- sub_zp(frag_b1, frag_zp[j], 1);
+ if constexpr (dequant_skip_flop && has_zp && !is_zp_float && !is_a_8bit) {
+ sub_zp(frag_b0, frag_zp[j], 0);
+ sub_zp(frag_b1, frag_zp[j], 1);
}
// Apply scale to frag_b0
- if constexpr (has_act_order) {
+ if constexpr (has_act_order && !is_a_8bit) {
static_assert(group_blocks != -1);
- scale4(frag_b0, act_frag_s[k2][0][j], act_frag_s[k2][1][j],
- act_frag_s[k2][2][j], act_frag_s[k2][3][j], 0);
- scale4(frag_b1, act_frag_s[k2][0][j], act_frag_s[k2][1][j],
- act_frag_s[k2][2][j], act_frag_s[k2][3][j], 1);
+ scale4(frag_b0, act_frag_s[k2][0][j], act_frag_s[k2][1][j],
+ act_frag_s[k2][2][j], act_frag_s[k2][3][j], 0);
+ scale4(frag_b1, act_frag_s[k2][0][j], act_frag_s[k2][1][j],
+ act_frag_s[k2][2][j], act_frag_s[k2][3][j], 1);
} else if constexpr (!dequant_skip_flop && has_zp && !is_zp_float &&
- group_blocks == -1) {
+ group_blocks == -1 && !is_a_8bit) {
int idx = (threadIdx.x / 4) % 2;
- scalar_t2 s2 = Dtype::nums2num2(
+ scalar_t2 s2 = Adtype::nums2num2(
reinterpret_cast(&frag_s[j / 2][j % 2 * 2 + 0])[idx],
reinterpret_cast(&frag_s[j / 2][j % 2 * 2 + 1])[idx]);
if (is_new_zp) frag_zp[j] = __hmul2(frag_zp[j], s2);
- scale_and_sub(frag_b0, s2.x, frag_zp[j].x);
- scale_and_sub(frag_b1, s2.y, frag_zp[j].y);
- } else if constexpr (!dequant_skip_flop && has_zp && group_blocks != -1) {
+ scale_and_sub(frag_b0, s2.x, frag_zp[j].x);
+ scale_and_sub(frag_b1, s2.y, frag_zp[j].y);
+ } else if constexpr (!dequant_skip_flop && has_zp && group_blocks != -1 &&
+ !is_a_8bit) {
if (is_new_zp)
frag_zp[j] = __hmul2(frag_zp[j],
*reinterpret_cast(&frag_s[k2][j]));
- scale_and_sub(frag_b0, frag_s[k2][j][0].x, frag_zp[j].x);
- scale_and_sub(frag_b1, frag_s[k2][j][0].y, frag_zp[j].y);
- } else if constexpr (group_blocks != -1) {
- scale(frag_b0, frag_s[k2][j], 0);
- scale(frag_b1, frag_s[k2][j], 1);
+ scale_and_sub(frag_b0, frag_s[k2][j][0].x, frag_zp[j].x);
+ scale_and_sub