diff --git a/.buildkite/nightly-benchmarks/README.md b/.buildkite/nightly-benchmarks/README.md
index cdf6a645147e5..ae42f70077cec 100644
--- a/.buildkite/nightly-benchmarks/README.md
+++ b/.buildkite/nightly-benchmarks/README.md
@@ -74,7 +74,7 @@ Here is an example of one test inside `latency-tests.json`:
In this example:
- The `test_name` attributes is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`.
-- The `parameters` attribute control the command line arguments to be used for `benchmark_latency.py`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underline to dash when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
+- The `parameters` attribute controls the command line arguments to be used for `vllm bench latency`. Note that please use underscore `_` instead of dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underscores to dashes when feeding the arguments to `vllm bench latency`. For example, the corresponding command line arguments for `vllm bench latency` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
Note that the performance numbers are highly sensitive to the value of the parameters. Please make sure the parameters are set correctly.
@@ -82,13 +82,13 @@ WARNING: The benchmarking script will save json results by itself, so please do
### Throughput test
-The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except for that the parameters will be fed forward to `benchmark_throughput.py`.
+The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except that the parameters will be fed forward to `vllm bench throughput`.
The number of this test is also stable -- a slight change on the value of this number might vary the performance numbers by a lot.
### Serving test
-We test the throughput by using `benchmark_serving.py` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example:
+We test the throughput by using `vllm bench serve` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example:
```json
[
@@ -118,8 +118,8 @@ Inside this example:
- The `test_name` attribute is also a unique identifier for the test. It must start with `serving_`.
- The `server-parameters` includes the command line arguments for vLLM server.
-- The `client-parameters` includes the command line arguments for `benchmark_serving.py`.
-- The `qps_list` controls the list of qps for test. It will be used to configure the `--request-rate` parameter in `benchmark_serving.py`
+- The `client-parameters` includes the command line arguments for `vllm bench serve`.
+- The `qps_list` controls the list of QPS values for the test. It will be used to configure the `--request-rate` parameter in `vllm bench serve`.
The number of this test is less stable compared to the delay and latency benchmarks (due to randomized sharegpt dataset sampling inside `benchmark_serving.py`), but a large change on this number (e.g. 5% change) still vary the output greatly.
diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
index 724b53056ca8f..05623879c0c2c 100644
--- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
@@ -100,7 +100,7 @@ if __name__ == "__main__":
raw_result = json.loads(f.read())
if "serving" in str(test_file):
- # this result is generated via `benchmark_serving.py`
+ # this result is generated via `vllm bench serve` command
# attach the benchmarking command to raw_result
try:
@@ -120,7 +120,7 @@ if __name__ == "__main__":
continue
elif "latency" in f.name:
- # this result is generated via `benchmark_latency.py`
+ # this result is generated via `vllm bench latency` command
# attach the benchmarking command to raw_result
try:
@@ -148,7 +148,7 @@ if __name__ == "__main__":
continue
elif "throughput" in f.name:
- # this result is generated via `benchmark_throughput.py`
+ # this result is generated via `vllm bench throughput` command
# attach the benchmarking command to raw_result
try:
diff --git a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
index 4d01a314adc47..06d7b5ed484da 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
@@ -73,7 +73,7 @@ get_current_llm_serving_engine() {
echo "Container: vllm"
# move to a completely irrelevant directory, to avoid import vllm from current folder
export CURRENT_LLM_SERVING_ENGINE=vllm
-
+
return
fi
}
@@ -95,12 +95,14 @@ json2args() {
}
kill_gpu_processes() {
- pkill -f python
- pkill -f python3
- pkill -f tritonserver
- pkill -f pt_main_thread
- pkill -f text-generation
- pkill -f lmdeploy
+ pkill -f '[p]ython'
+ pkill -f '[p]ython3'
+ pkill -f '[t]ritonserver'
+ pkill -f '[p]t_main_thread'
+ pkill -f '[t]ext-generation'
+ pkill -f '[l]mdeploy'
+ # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
+ pkill -f '[V]LLM'
while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
sleep 1
@@ -125,7 +127,7 @@ ensure_installed() {
}
run_serving_tests() {
- # run serving tests using `benchmark_serving.py`
+ # run serving tests using `vllm bench serve` command
# $1: a json file specifying serving test cases
local serving_test_file
@@ -225,7 +227,7 @@ run_serving_tests() {
if [[ "$dataset_name" = "sharegpt" ]]; then
- client_command="python3 benchmark_serving.py \
+ client_command="vllm bench serve \
--backend $backend \
--tokenizer /tokenizer_cache \
--model $model \
@@ -246,7 +248,7 @@ run_serving_tests() {
sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len')
sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len')
- client_command="python3 benchmark_serving.py \
+ client_command="vllm bench serve \
--backend $backend \
--tokenizer /tokenizer_cache \
--model $model \
@@ -265,13 +267,13 @@ run_serving_tests() {
$client_args"
else
-
+
echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name."
exit 1
fi
-
+
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
@@ -302,7 +304,7 @@ run_serving_tests() {
}
run_genai_perf_tests() {
- # run genai-perf tests
+ # run genai-perf tests
# $1: a json file specifying genai-perf test cases
local genai_perf_test_file
@@ -311,14 +313,14 @@ run_genai_perf_tests() {
# Iterate over genai-perf tests
jq -c '.[]' "$genai_perf_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
- test_name=$(echo "$params" | jq -r '.test_name')
-
+ test_name=$(echo "$params" | jq -r '.test_name')
+
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
-
+
# prepend the current serving engine to the test name
test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
@@ -369,10 +371,10 @@ run_genai_perf_tests() {
qps=$num_prompts
echo "now qps is $qps"
fi
-
+
new_test_name=$test_name"_qps_"$qps
backend=$CURRENT_LLM_SERVING_ENGINE
-
+
if [[ "$backend" == *"vllm"* ]]; then
backend="vllm"
fi
@@ -413,7 +415,7 @@ prepare_dataset() {
do
cat sonnet.txt >> sonnet_4x.txt
done
-
+
}
main() {
diff --git a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
index f05040618981c..b515ee43934d1 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
@@ -126,7 +126,8 @@ kill_gpu_processes() {
ps -aux
lsof -t -i:8000 | xargs -r kill -9
pgrep python3 | xargs -r kill -9
-
+ # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
+ pgrep VLLM | xargs -r kill -9
# wait until GPU memory usage smaller than 1GB
if command -v nvidia-smi; then
@@ -164,7 +165,7 @@ upload_to_buildkite() {
}
run_latency_tests() {
- # run latency tests using `benchmark_latency.py`
+ # run latency tests using `vllm bench latency` command
# $1: a json file specifying latency test cases
local latency_test_file
@@ -205,7 +206,7 @@ run_latency_tests() {
fi
fi
- latency_command=" $latency_envs python3 benchmark_latency.py \
+ latency_command=" $latency_envs vllm bench latency \
--output-json $RESULTS_FOLDER/${test_name}.json \
$latency_args"
@@ -231,7 +232,7 @@ run_latency_tests() {
}
run_throughput_tests() {
- # run throughput tests using `benchmark_throughput.py`
+ # run throughput tests using `vllm bench throughput` command
# $1: a json file specifying throughput test cases
local throughput_test_file
@@ -272,7 +273,7 @@ run_throughput_tests() {
fi
fi
- throughput_command=" $throughput_envs python3 benchmark_throughput.py \
+ throughput_command=" $throughput_envs vllm bench throughput \
--output-json $RESULTS_FOLDER/${test_name}.json \
$throughput_args"
@@ -297,7 +298,7 @@ run_throughput_tests() {
}
run_serving_tests() {
- # run serving tests using `benchmark_serving.py`
+ # run serving tests using `vllm bench serve` command
# $1: a json file specifying serving test cases
local serving_test_file
@@ -393,7 +394,7 @@ run_serving_tests() {
# pass the tensor parallel size to the client so that it can be displayed
# on the benchmark dashboard
- client_command="python3 benchmark_serving.py \
+ client_command="vllm bench serve \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
@@ -447,7 +448,7 @@ main() {
(which jq) || (apt-get update && apt-get -y install jq)
(which lsof) || (apt-get update && apt-get install -y lsof)
- # get the current IP address, required by benchmark_serving.py
+ # get the current IP address, required by `vllm bench serve` command
export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
# turn of the reporting of the status of each request, to clean up the terminal output
export VLLM_LOGGING_LEVEL="WARNING"
diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
index 90cc9c8446223..7c7dbb461ce0d 100644
--- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
@@ -13,9 +13,9 @@ NUMA_NODE=${NUMA_NODE:-1}
export CMAKE_BUILD_PARALLEL_LEVEL=32
# Setup cleanup
-remove_docker_container() {
- set -e;
- docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true;
+remove_docker_container() {
+ set -e;
+ docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true;
}
trap remove_docker_container EXIT
remove_docker_container
@@ -69,7 +69,7 @@ function cpu_tests() {
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -s -v \
- tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"
+ tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"
# Note: disable it until supports V1
# Run AWQ test
@@ -83,7 +83,7 @@ function cpu_tests() {
set -e
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
- python3 benchmarks/benchmark_serving.py \
+ vllm bench serve \
--backend vllm \
--dataset-name random \
--model meta-llama/Llama-3.2-3B-Instruct \
diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
new file mode 100755
index 0000000000000..d998c1f73b514
--- /dev/null
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
@@ -0,0 +1,166 @@
+#!/bin/bash
+
+set -xu
+
+
+remove_docker_container() {
+ docker rm -f tpu-test || true;
+ docker rm -f vllm-tpu || true;
+}
+
+trap remove_docker_container EXIT
+
+# Remove the container that might not be cleaned up in the previous run.
+remove_docker_container
+
+# Build the docker image.
+docker build -f docker/Dockerfile.tpu -t vllm-tpu .
+
+# Set up cleanup.
+cleanup_docker() {
+ # Get Docker's root directory
+ docker_root=$(docker info -f '{{.DockerRootDir}}')
+ if [ -z "$docker_root" ]; then
+ echo "Failed to determine Docker root directory."
+ exit 1
+ fi
+ echo "Docker root directory: $docker_root"
+ # Check disk usage of the filesystem where Docker's root directory is located
+ disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
+ # Define the threshold
+ threshold=70
+ if [ "$disk_usage" -gt "$threshold" ]; then
+ echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
+ # Remove dangling images (those that are not tagged and not used by any container)
+ docker image prune -f
+ # Remove unused volumes / force the system prune for old images as well.
+ docker volume prune -f && docker system prune --force --filter "until=72h" --all
+ echo "Docker images and volumes cleanup completed."
+ else
+ echo "Disk usage is below $threshold%. No cleanup needed."
+ fi
+}
+cleanup_docker
+
+# For HF_TOKEN.
+source /etc/environment
+
+docker run --privileged --net host --shm-size=16G -it \
+ -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
+ vllm-tpu /bin/bash -c '
+set -e # Exit immediately if a command exits with a non-zero status.
+set -u # Treat unset variables as an error.
+
+echo "--- Starting script inside Docker container ---"
+
+# Create results directory
+RESULTS_DIR=$(mktemp -d)
+# If mktemp fails, set -e will cause the script to exit.
+echo "Results will be stored in: $RESULTS_DIR"
+
+# Install dependencies
+echo "--- Installing Python dependencies ---"
+python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
+ && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
+ && python3 -m pip install --progress-bar off lm_eval[api]==0.4.4 \
+ && python3 -m pip install --progress-bar off hf-transfer
+echo "--- Python dependencies installed ---"
+export VLLM_USE_V1=1
+export VLLM_XLA_CHECK_RECOMPILATION=1
+export VLLM_XLA_CACHE_PATH=
+echo "Using VLLM V1"
+
+echo "--- Hardware Information ---"
+# tpu-info
+echo "--- Starting Tests ---"
+set +e
+overall_script_exit_code=0
+
+# --- Test Definitions ---
+# If a test fails, this function will print logs and will not cause the main script to exit.
+run_test() {
+ local test_num=$1
+ local test_name=$2
+ local test_command=$3
+ local log_file="$RESULTS_DIR/test_${test_num}.log"
+ local actual_exit_code
+
+ echo "--- TEST_$test_num: Running $test_name ---"
+
+ # Execute the test command.
+ eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2)
+ actual_exit_code=$?
+
+ echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" # This goes to main log
+ echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file" # Also to per-test log
+
+ if [ "$actual_exit_code" -ne 0 ]; then
+ echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2
+ echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2
+ if [ -f "$log_file" ]; then
+ cat "$log_file" >&2
+ else
+ echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2
+ fi
+ echo "--- End of log for TEST_$test_num ($test_name) ---" >&2
+ return "$actual_exit_code" # Return the failure code
+ else
+ echo "TEST_$test_num ($test_name) PASSED."
+ return 0 # Return success
+ fi
+}
+
+# Helper function to call run_test and update the overall script exit code
+run_and_track_test() {
+ local test_num_arg="$1"
+ local test_name_arg="$2"
+ local test_command_arg="$3"
+
+ # Run the test
+ run_test "$test_num_arg" "$test_name_arg" "$test_command_arg"
+ local test_specific_exit_code=$?
+
+ # If the test failed, set the overall script exit code to 1
+ if [ "$test_specific_exit_code" -ne 0 ]; then
+ # No need for extra echo here, run_test already logged the failure.
+ overall_script_exit_code=1
+ fi
+}
+
+# --- Actual Test Execution ---
+run_and_track_test 1 "test_struct_output_generate.py" \
+ "HF_HUB_DISABLE_XET=1 python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
+run_and_track_test 2 "test_moe_pallas.py" \
+ "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
+run_and_track_test 3 "test_lora.py" \
+ "VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
+run_and_track_test 4 "test_tpu_qkv_linear.py" \
+ "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
+run_and_track_test 5 "test_spmd_model_weight_loading.py" \
+ "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
+run_and_track_test 6 "test_kv_cache_update_kernel.py" \
+ "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py"
+
+# After all tests have been attempted, exit with the overall status.
+if [ "$overall_script_exit_code" -ne 0 ]; then
+ echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---"
+else
+ echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---"
+fi
+exit "$overall_script_exit_code"
+' # IMPORTANT: This is the closing single quote for the bash -c "..." command. Ensure it is present and correct.
+
+# Capture the exit code of the docker run command
+DOCKER_RUN_EXIT_CODE=$?
+
+# The trap will run for cleanup.
+# Exit the main script with the Docker run command's exit code.
+if [ "$DOCKER_RUN_EXIT_CODE" -ne 0 ]; then
+ echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE."
+ exit "$DOCKER_RUN_EXIT_CODE"
+else
+ echo "Docker run command completed successfully."
+ exit 0
+fi
+# TODO: This test fails because it uses RANDOM_SEED sampling
+# pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
index 5514d7770cff8..e565d4b246945 100755
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
@@ -150,18 +150,6 @@ run_and_track_test 9 "test_multimodal.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py"
run_and_track_test 10 "test_pallas.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py"
-run_and_track_test 11 "test_struct_output_generate.py" \
- "HF_HUB_DISABLE_XET=1 python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
-run_and_track_test 12 "test_moe_pallas.py" \
- "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
-run_and_track_test 13 "test_lora.py" \
- "VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
-run_and_track_test 14 "test_tpu_qkv_linear.py" \
- "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
-run_and_track_test 15 "test_spmd_model_weight_loading.py" \
- "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
-run_and_track_test 16 "test_kv_cache_update_kernel.py" \
- "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py"
# After all tests have been attempted, exit with the overall status.
if [ "$overall_script_exit_code" -ne 0 ]; then
diff --git a/.buildkite/scripts/run-benchmarks.sh b/.buildkite/scripts/run-benchmarks.sh
index 195a8063fd743..72812218cb668 100644
--- a/.buildkite/scripts/run-benchmarks.sh
+++ b/.buildkite/scripts/run-benchmarks.sh
@@ -11,10 +11,10 @@ cd "$(dirname "${BASH_SOURCE[0]}")/../.."
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
# run python-based benchmarks and upload the result to buildkite
-python3 benchmarks/benchmark_latency.py --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
+vllm bench latency --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
bench_latency_exit_code=$?
-python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
+vllm bench throughput --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
bench_throughput_exit_code=$?
# run server-based benchmarks and upload the result to buildkite
@@ -24,7 +24,7 @@ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/r
# wait for server to start, timeout after 600 seconds
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
-python3 benchmarks/benchmark_serving.py \
+vllm bench serve \
--backend vllm \
--dataset-name sharegpt \
--dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \
diff --git a/.buildkite/scripts/tpu/run_bm.sh b/.buildkite/scripts/tpu/run_bm.sh
index 877669cd956ac..beecaf7a740ae 100755
--- a/.buildkite/scripts/tpu/run_bm.sh
+++ b/.buildkite/scripts/tpu/run_bm.sh
@@ -77,7 +77,7 @@ done
echo "run benchmark test..."
echo "logging to $BM_LOG"
echo
-python benchmarks/benchmark_serving.py \
+vllm bench serve \
--backend vllm \
--model $MODEL \
--dataset-name sonnet \
diff --git a/.github/workflows/lint-and-deploy.yaml b/.github/workflows/lint-and-deploy.yaml
index 74a7a3a3530f5..d5736c0aee208 100644
--- a/.github/workflows/lint-and-deploy.yaml
+++ b/.github/workflows/lint-and-deploy.yaml
@@ -7,7 +7,7 @@ permissions:
jobs:
lint-and-deploy:
- runs-on: ubuntu-latest
+ runs-on: ubuntu-24.04-arm
steps:
- name: Checkout
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
diff --git a/benchmarks/README.md b/benchmarks/README.md
index fb8690d42db98..3b10963c3e014 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -98,7 +98,7 @@ Then run the benchmarking script
```bash
# download dataset
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-python3 vllm/benchmarks/benchmark_serving.py \
+vllm bench serve \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--endpoint /v1/completions \
@@ -111,25 +111,25 @@ If successful, you will see the following output
```
============ Serving Benchmark Result ============
-Successful requests: 10
-Benchmark duration (s): 5.78
-Total input tokens: 1369
-Total generated tokens: 2212
-Request throughput (req/s): 1.73
-Output token throughput (tok/s): 382.89
-Total Token throughput (tok/s): 619.85
+Successful requests: 10
+Benchmark duration (s): 5.78
+Total input tokens: 1369
+Total generated tokens: 2212
+Request throughput (req/s): 1.73
+Output token throughput (tok/s): 382.89
+Total Token throughput (tok/s): 619.85
---------------Time to First Token----------------
-Mean TTFT (ms): 71.54
-Median TTFT (ms): 73.88
-P99 TTFT (ms): 79.49
+Mean TTFT (ms): 71.54
+Median TTFT (ms): 73.88
+P99 TTFT (ms): 79.49
-----Time per Output Token (excl. 1st token)------
-Mean TPOT (ms): 7.91
-Median TPOT (ms): 7.96
-P99 TPOT (ms): 8.03
+Mean TPOT (ms): 7.91
+Median TPOT (ms): 7.96
+P99 TPOT (ms): 8.03
---------------Inter-token Latency----------------
-Mean ITL (ms): 7.74
-Median ITL (ms): 7.70
-P99 ITL (ms): 8.39
+Mean ITL (ms): 7.74
+Median ITL (ms): 7.70
+P99 ITL (ms): 8.39
==================================================
```
@@ -141,7 +141,7 @@ If the dataset you want to benchmark is not supported yet in vLLM, even then you
{"prompt": "What is the capital of India?"}
{"prompt": "What is the capital of Iran?"}
{"prompt": "What is the capital of China?"}
-```
+```
```bash
# start server
@@ -150,7 +150,7 @@ VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct --disable-log-requests
```bash
# run benchmarking script
-python3 benchmarks/benchmark_serving.py --port 9001 --save-result --save-detailed \
+vllm bench serve --port 9001 --save-result --save-detailed \
--backend vllm \
--model meta-llama/Llama-3.1-8B-Instruct \
--endpoint /v1/completions \
@@ -174,7 +174,7 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
```
```bash
-python3 vllm/benchmarks/benchmark_serving.py \
+vllm bench serve \
--backend openai-chat \
--model Qwen/Qwen2-VL-7B-Instruct \
--endpoint /v1/chat/completions \
@@ -194,7 +194,7 @@ VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
```
``` bash
-python3 benchmarks/benchmark_serving.py \
+vllm bench serve \
--model meta-llama/Meta-Llama-3-8B-Instruct \
--dataset-name hf \
--dataset-path likaixin/InstructCoder \
@@ -210,7 +210,7 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
**`lmms-lab/LLaVA-OneVision-Data`**
```bash
-python3 vllm/benchmarks/benchmark_serving.py \
+vllm bench serve \
--backend openai-chat \
--model Qwen/Qwen2-VL-7B-Instruct \
--endpoint /v1/chat/completions \
@@ -224,7 +224,7 @@ python3 vllm/benchmarks/benchmark_serving.py \
**`Aeala/ShareGPT_Vicuna_unfiltered`**
```bash
-python3 vllm/benchmarks/benchmark_serving.py \
+vllm bench serve \
--backend openai-chat \
--model Qwen/Qwen2-VL-7B-Instruct \
--endpoint /v1/chat/completions \
@@ -237,7 +237,7 @@ python3 vllm/benchmarks/benchmark_serving.py \
**`AI-MO/aimo-validation-aime`**
``` bash
-python3 vllm/benchmarks/benchmark_serving.py \
+vllm bench serve \
--model Qwen/QwQ-32B \
--dataset-name hf \
--dataset-path AI-MO/aimo-validation-aime \
@@ -248,7 +248,7 @@ python3 vllm/benchmarks/benchmark_serving.py \
**`philschmid/mt-bench`**
``` bash
-python3 vllm/benchmarks/benchmark_serving.py \
+vllm bench serve \
--model Qwen/QwQ-32B \
--dataset-name hf \
--dataset-path philschmid/mt-bench \
@@ -261,7 +261,7 @@ When using OpenAI-compatible backends such as `vllm`, optional sampling
parameters can be specified. Example client command:
```bash
-python3 vllm/benchmarks/benchmark_serving.py \
+vllm bench serve \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--endpoint /v1/completions \
@@ -296,7 +296,7 @@ The following arguments can be used to control the ramp-up:
```bash
-python3 vllm/benchmarks/benchmark_throughput.py \
+vllm bench throughput \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--dataset-name sonnet \
--dataset-path vllm/benchmarks/sonnet.txt \
@@ -314,7 +314,7 @@ Total num output tokens: 1500
**VisionArena Benchmark for Vision Language Models**
``` bash
-python3 vllm/benchmarks/benchmark_throughput.py \
+vllm bench throughput \
--model Qwen/Qwen2-VL-7B-Instruct \
--backend vllm-chat \
--dataset-name hf \
@@ -336,7 +336,7 @@ Total num output tokens: 1280
``` bash
VLLM_WORKER_MULTIPROC_METHOD=spawn \
VLLM_USE_V1=1 \
-python3 vllm/benchmarks/benchmark_throughput.py \
+vllm bench throughput \
--dataset-name=hf \
--dataset-path=likaixin/InstructCoder \
--model=meta-llama/Meta-Llama-3-8B-Instruct \
@@ -360,7 +360,7 @@ Total num output tokens: 204800
**`lmms-lab/LLaVA-OneVision-Data`**
```bash
-python3 vllm/benchmarks/benchmark_throughput.py \
+vllm bench throughput \
--model Qwen/Qwen2-VL-7B-Instruct \
--backend vllm-chat \
--dataset-name hf \
@@ -373,7 +373,7 @@ python3 vllm/benchmarks/benchmark_throughput.py \
**`Aeala/ShareGPT_Vicuna_unfiltered`**
```bash
-python3 vllm/benchmarks/benchmark_throughput.py \
+vllm bench throughput \
--model Qwen/Qwen2-VL-7B-Instruct \
--backend vllm-chat \
--dataset-name hf \
@@ -385,7 +385,7 @@ python3 vllm/benchmarks/benchmark_throughput.py \
**`AI-MO/aimo-validation-aime`**
```bash
-python3 benchmarks/benchmark_throughput.py \
+vllm bench throughput \
--model Qwen/QwQ-32B \
--backend vllm \
--dataset-name hf \
@@ -399,7 +399,7 @@ python3 benchmarks/benchmark_throughput.py \
``` bash
# download dataset
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-python3 vllm/benchmarks/benchmark_throughput.py \
+vllm bench throughput \
--model meta-llama/Llama-2-7b-hf \
--backend vllm \
--dataset_path /ShareGPT_V3_unfiltered_cleaned_split.json \
diff --git a/benchmarks/auto_tune/README.md b/benchmarks/auto_tune/README.md
index ae5962fe92542..c479ff1aa29c0 100644
--- a/benchmarks/auto_tune/README.md
+++ b/benchmarks/auto_tune/README.md
@@ -105,7 +105,7 @@ After the script finishes, you will find the results in a new, timestamped direc
- **Log Files**: The directory (`$BASE/auto-benchmark/YYYY_MM_DD_HH_MM/`) contains detailed logs for each run:
- `vllm_log_...txt`: The log output from the vLLM server for each parameter combination.
- - `bm_log_...txt`: The log output from the `benchmark_serving.py` script for each benchmark run.
+ - `bm_log_...txt`: The log output from the `vllm bench serve` command for each benchmark run.
- **Final Result Summary**: A file named `result.txt` is created in the log directory. It contains a summary of each tested combination and concludes with the overall best parameters found.
diff --git a/benchmarks/auto_tune/auto_tune.sh b/benchmarks/auto_tune/auto_tune.sh
index 8d3e1d4bee352..3cd8580e065dd 100644
--- a/benchmarks/auto_tune/auto_tune.sh
+++ b/benchmarks/auto_tune/auto_tune.sh
@@ -1,6 +1,6 @@
#!/bin/bash
-# This script aims to tune the best server parameter combinations to maximize throughput for given requirement.
+# This script aims to tune the best server parameter combinations to maximize throughput for given requirement.
# See details in README (benchmarks/auto_tune/README.md).
TAG=$(date +"%Y_%m_%d_%H_%M")
@@ -56,7 +56,7 @@ start_server() {
local max_num_batched_tokens=$3
local vllm_log=$4
local profile_dir=$5
-
+
pkill -f vllm
VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir vllm serve $MODEL \
@@ -73,9 +73,9 @@ start_server() {
# wait for 10 minutes...
server_started=0
- for i in {1..60}; do
+ for i in {1..60}; do
RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
- STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
+ STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
if [[ "$STATUS_CODE" -eq 200 ]]; then
server_started=1
break
@@ -98,10 +98,10 @@ update_best_profile() {
selected_profile_file=
if [[ "$SYSTEM" == "TPU" ]]; then
selected_profile_file="${sorted_paths[$profile_index]}/*.xplane.pb"
- fi
+ fi
if [[ "$SYSTEM" == "GPU" ]]; then
selected_profile_file="${sorted_paths[$profile_index]}"
- fi
+ fi
rm -f $PROFILE_PATH/*
cp $selected_profile_file $PROFILE_PATH
}
@@ -129,14 +129,14 @@ run_benchmark() {
echo "server started."
fi
echo
-
+
echo "run benchmark test..."
meet_latency_requirement=0
# get a basic qps by using request-rate inf
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt"
prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 ))
adjusted_input_len=$(( INPUT_LEN - prefix_len ))
- python3 benchmarks/benchmark_serving.py \
+ vllm bench serve \
--backend vllm \
--model $MODEL \
--dataset-name random \
@@ -169,7 +169,7 @@ adjusted_input_len=$(( INPUT_LEN - prefix_len ))
curl -X POST http://0.0.0.0:8004/reset_prefix_cache
sleep 5
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt"
- python3 benchmarks/benchmark_serving.py \
+ vllm bench serve \
--backend vllm \
--model $MODEL \
--dataset-name random \
diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index 4d2ea126b24a5..d8b960edaa468 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -11,6 +11,7 @@ from typing import Any, Optional
import numpy as np
from tqdm import tqdm
+from typing_extensions import deprecated
import vllm.envs as envs
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
@@ -34,6 +35,10 @@ def save_to_pytorch_benchmark_format(
write_to_json(pt_file, pt_records)
+@deprecated(
+ "benchmark_latency.py is deprecated and will be removed in a "
+ "future version. Please use 'vllm bench latency' instead.",
+)
def main(args: argparse.Namespace):
print(args)
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index c597fb1068aba..a97fa280f37c0 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -38,6 +38,7 @@ from typing import Any, Literal, Optional
import numpy as np
from tqdm.asyncio import tqdm
from transformers import PreTrainedTokenizerBase
+from typing_extensions import deprecated
from backend_request_func import (
ASYNC_REQUEST_FUNCS,
@@ -593,6 +594,10 @@ def save_to_pytorch_benchmark_format(
write_to_json(pt_file, pt_records)
+@deprecated(
+ "benchmark_serving.py is deprecated and will be removed in a future "
+ "version. Please use 'vllm bench serve' instead.",
+)
def main(args: argparse.Namespace):
print(args)
random.seed(args.seed)
diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index c0a7f1d582505..c51b579686529 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -15,6 +15,7 @@ import torch
import uvloop
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase
+from typing_extensions import deprecated
from benchmark_dataset import (
AIMODataset,
@@ -382,6 +383,10 @@ def get_requests(args, tokenizer):
return dataset_cls(**common_kwargs).sample(**sample_kwargs)
+@deprecated(
+ "benchmark_throughput.py is deprecated and will be removed in a "
+ "future version. Please use 'vllm bench throughput' instead.",
+)
def main(args: argparse.Namespace):
if args.seed is None:
args.seed = 0
diff --git a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
index 94999630bae12..92f97ffabea2a 100644
--- a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
+++ b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
@@ -3,7 +3,7 @@
# benchmark the overhead of disaggregated prefill.
# methodology:
# - send all request to prefill vLLM instance. It will buffer KV cache.
-# - then send all request to decode instance.
+# - then send all request to decode instance.
# - The TTFT of decode instance is the overhead.
set -ex
@@ -12,6 +12,8 @@ kill_gpu_processes() {
# kill all processes on GPU.
pgrep pt_main_thread | xargs -r kill -9
pgrep python3 | xargs -r kill -9
+ # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
+ pgrep VLLM | xargs -r kill -9
sleep 10
# remove vllm config file
@@ -61,7 +63,7 @@ benchmark() {
--gpu-memory-utilization 0.6 \
--kv-transfer-config \
'{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
-
+
CUDA_VISIBLE_DEVICES=1 python3 \
-m vllm.entrypoints.openai.api_server \
@@ -76,38 +78,38 @@ benchmark() {
wait_for_server 8200
# let the prefill instance finish prefill
- python3 ../benchmark_serving.py \
- --backend vllm \
- --model $model \
- --dataset-name $dataset_name \
- --dataset-path $dataset_path \
- --sonnet-input-len $input_len \
- --sonnet-output-len "$output_len" \
- --sonnet-prefix-len $prefix_len \
- --num-prompts $num_prompts \
- --port 8100 \
- --save-result \
- --result-dir $results_folder \
- --result-filename disagg_prefill_tp1.json \
- --request-rate "inf"
+ vllm bench serve \
+ --backend vllm \
+ --model $model \
+ --dataset-name $dataset_name \
+ --dataset-path $dataset_path \
+ --sonnet-input-len $input_len \
+ --sonnet-output-len "$output_len" \
+ --sonnet-prefix-len $prefix_len \
+ --num-prompts $num_prompts \
+ --port 8100 \
+ --save-result \
+ --result-dir $results_folder \
+ --result-filename disagg_prefill_tp1.json \
+ --request-rate "inf"
# send the request to decode.
# The TTFT of this command will be the overhead of disagg prefill impl.
- python3 ../benchmark_serving.py \
- --backend vllm \
- --model $model \
- --dataset-name $dataset_name \
- --dataset-path $dataset_path \
- --sonnet-input-len $input_len \
- --sonnet-output-len "$output_len" \
- --sonnet-prefix-len $prefix_len \
- --num-prompts $num_prompts \
- --port 8200 \
- --save-result \
- --result-dir $results_folder \
- --result-filename disagg_prefill_tp1_overhead.json \
- --request-rate "$qps"
+ vllm bench serve \
+ --backend vllm \
+ --model $model \
+ --dataset-name $dataset_name \
+ --dataset-path $dataset_path \
+ --sonnet-input-len $input_len \
+ --sonnet-output-len "$output_len" \
+ --sonnet-prefix-len $prefix_len \
+ --num-prompts $num_prompts \
+ --port 8200 \
+ --save-result \
+ --result-dir $results_folder \
+ --result-filename disagg_prefill_tp1_overhead.json \
+ --request-rate "$qps"
kill_gpu_processes
}
diff --git a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
index eb5d891d0d4a5..af2bcba3ea57a 100644
--- a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
+++ b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
@@ -18,6 +18,8 @@ kill_gpu_processes() {
# kill all processes on GPU.
pgrep pt_main_thread | xargs -r kill -9
pgrep python3 | xargs -r kill -9
+ # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
+ pgrep VLLM | xargs -r kill -9
for port in 8000 8100 8200; do lsof -t -i:$port | xargs -r kill -9; done
sleep 1
}
@@ -58,7 +60,7 @@ launch_chunked_prefill() {
launch_disagg_prefill() {
- model="meta-llama/Meta-Llama-3.1-8B-Instruct"
+ model="meta-llama/Meta-Llama-3.1-8B-Instruct"
# disagg prefill
CUDA_VISIBLE_DEVICES=0 python3 \
-m vllm.entrypoints.openai.api_server \
@@ -97,20 +99,20 @@ benchmark() {
output_len=$2
tag=$3
- python3 ../benchmark_serving.py \
- --backend vllm \
- --model $model \
- --dataset-name $dataset_name \
- --dataset-path $dataset_path \
- --sonnet-input-len $input_len \
- --sonnet-output-len "$output_len" \
- --sonnet-prefix-len $prefix_len \
- --num-prompts $num_prompts \
- --port 8000 \
- --save-result \
- --result-dir $results_folder \
- --result-filename "$tag"-qps-"$qps".json \
- --request-rate "$qps"
+ vllm bench serve \
+ --backend vllm \
+ --model $model \
+ --dataset-name $dataset_name \
+ --dataset-path $dataset_path \
+ --sonnet-input-len $input_len \
+ --sonnet-output-len "$output_len" \
+ --sonnet-prefix-len $prefix_len \
+ --num-prompts $num_prompts \
+ --port 8000 \
+ --save-result \
+ --result-dir $results_folder \
+ --result-filename "$tag"-qps-"$qps".json \
+ --request-rate "$qps"
sleep 2
}
diff --git a/benchmarks/kernels/benchmark_moe_align_block_size.py b/benchmarks/kernels/benchmark_moe_align_block_size.py
index 1af5a21caf465..f540cff6261a8 100644
--- a/benchmarks/kernels/benchmark_moe_align_block_size.py
+++ b/benchmarks/kernels/benchmark_moe_align_block_size.py
@@ -5,9 +5,8 @@ import itertools
import torch
-from vllm import _custom_ops as ops
from vllm.model_executor.layers.fused_moe.moe_align_block_size import (
- moe_align_block_size_triton,
+ moe_align_block_size,
)
from vllm.triton_utils import triton
@@ -21,60 +20,6 @@ def get_topk_ids(num_tokens: int, num_experts: int, topk: int) -> torch.Tensor:
)
-def check_correctness(num_tokens, num_experts=256, block_size=256, topk=8):
- """
- Verifies vllm vs. Triton
- """
- topk_ids = get_topk_ids(num_tokens, num_experts, topk)
-
- # 1. malloc space for triton and vllm
- # malloc enough space (max_num_tokens_padded) for the sorted ids
- max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
- sorted_ids_triton = torch.empty(
- (max_num_tokens_padded,), dtype=torch.int32, device="cuda"
- )
- expert_ids_triton = torch.empty(
- (max_num_tokens_padded // block_size,), dtype=torch.int32, device="cuda"
- )
- num_tokens_post_pad_triton = torch.empty((1,), dtype=torch.int32, device="cuda")
-
- sorted_ids_vllm = torch.empty_like(sorted_ids_triton)
- expert_ids_vllm = torch.empty_like(expert_ids_triton)
- num_tokens_post_pad_vllm = torch.empty_like(num_tokens_post_pad_triton)
-
- # 2. run implementations
- moe_align_block_size_triton(
- topk_ids,
- num_experts,
- block_size,
- sorted_ids_triton,
- expert_ids_triton,
- num_tokens_post_pad_triton,
- )
-
- ops.moe_align_block_size(
- topk_ids,
- num_experts,
- block_size,
- sorted_ids_vllm,
- expert_ids_vllm,
- num_tokens_post_pad_vllm,
- )
- print(f"✅ VLLM implementation works with {num_experts} experts!")
-
- # 3. compare results
- if torch.allclose(expert_ids_triton, expert_ids_vllm) and torch.allclose(
- num_tokens_post_pad_triton, num_tokens_post_pad_vllm
- ):
- print("✅ Triton and VLLM implementations match.")
- else:
- print("❌ Triton and VLLM implementations DO NOT match.")
- print("Triton expert_ids:", expert_ids_triton)
- print("VLLM expert_ids:", expert_ids_vllm)
- print("Triton num_tokens_post_pad:", num_tokens_post_pad_triton)
- print("VLLM num_tokens_post_pad:", num_tokens_post_pad_vllm)
-
-
# test configurations
num_tokens_range = [1, 16, 256, 4096]
num_experts_range = [16, 64, 224, 256, 280, 512]
@@ -87,8 +32,8 @@ configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range
x_names=["num_tokens", "num_experts", "topk"],
x_vals=configs,
line_arg="provider",
- line_vals=["vllm", "triton"], # "triton"
- line_names=["VLLM", "Triton"], # "Triton"
+ line_vals=["vllm"],
+ line_names=["vLLM"],
plot_name="moe-align-block-size-performance",
args={},
)
@@ -98,36 +43,11 @@ def benchmark(num_tokens, num_experts, topk, provider):
block_size = 256
topk_ids = get_topk_ids(num_tokens, num_experts, topk)
- max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
- sorted_ids = torch.empty((max_num_tokens_padded,), dtype=torch.int32, device="cuda")
- max_num_m_blocks = max_num_tokens_padded // block_size
- expert_ids = torch.empty((max_num_m_blocks,), dtype=torch.int32, device="cuda")
- num_tokens_post_pad = torch.empty((1,), dtype=torch.int32, device="cuda")
-
quantiles = [0.5, 0.2, 0.8]
if provider == "vllm":
ms, min_ms, max_ms = triton.testing.do_bench(
- lambda: ops.moe_align_block_size(
- topk_ids,
- num_experts,
- block_size,
- sorted_ids.clone(),
- expert_ids.clone(),
- num_tokens_post_pad.clone(),
- ),
- quantiles=quantiles,
- )
- elif provider == "triton":
- ms, min_ms, max_ms = triton.testing.do_bench(
- lambda: moe_align_block_size_triton(
- topk_ids,
- num_experts,
- block_size,
- sorted_ids.clone(),
- expert_ids.clone(),
- num_tokens_post_pad.clone(),
- ),
+ lambda: moe_align_block_size(topk_ids, block_size, num_experts),
quantiles=quantiles,
)
@@ -151,6 +71,4 @@ if __name__ == "__main__":
)
args = parser.parse_args()
- print("Running correctness check...")
- check_correctness(num_tokens=1024, num_experts=args.num_experts, topk=args.topk)
benchmark.run(print_data=True, show_plots=True)
diff --git a/benchmarks/kernels/benchmark_moe_permute_unpermute.py b/benchmarks/kernels/benchmark_moe_permute_unpermute.py
index 4ed6900901442..04d2205aa3722 100644
--- a/benchmarks/kernels/benchmark_moe_permute_unpermute.py
+++ b/benchmarks/kernels/benchmark_moe_permute_unpermute.py
@@ -8,12 +8,13 @@ import ray
import torch
from transformers import AutoConfig
-from vllm.model_executor.layers.fused_moe.deep_gemm_moe import (
+from vllm.model_executor.layers.fused_moe.fused_moe import *
+from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import (
_moe_permute,
_moe_unpermute_and_reduce,
+ moe_permute,
+ moe_unpermute,
)
-from vllm.model_executor.layers.fused_moe.fused_moe import *
-from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import *
from vllm.model_executor.layers.fused_moe.utils import _fp8_quantize
from vllm.platforms import current_platform
from vllm.utils import FlexibleArgumentParser
@@ -63,18 +64,19 @@ def benchmark_permute(
def run():
if use_customized_permute:
- (permuted_hidden_states, first_token_off, inv_perm_idx, m_indices) = (
- moe_permute(
- qhidden_states,
- topk_weights=topk_weights,
- topk_ids=topk_ids,
- token_expert_indices=token_expert_indices,
- topk=topk,
- n_expert=num_experts,
- n_local_expert=num_experts,
- expert_map=None,
- align_block_size=align_block_size,
- )
+ (
+ permuted_hidden_states,
+ a1q_scale,
+ first_token_off,
+ inv_perm_idx,
+ m_indices,
+ ) = moe_permute(
+ qhidden_states,
+ a1q_scale=None,
+ topk_ids=topk_ids,
+ n_expert=num_experts,
+ expert_map=None,
+ align_block_size=align_block_size,
)
else:
(
@@ -150,18 +152,19 @@ def benchmark_unpermute(
def prepare():
if use_customized_permute:
- (permuted_hidden_states, first_token_off, inv_perm_idx, m_indices) = (
- moe_permute(
- qhidden_states,
- topk_weights=topk_weights,
- topk_ids=topk_ids,
- token_expert_indices=token_expert_indices,
- topk=topk,
- n_expert=num_experts,
- n_local_expert=num_experts,
- expert_map=None,
- align_block_size=align_block_size,
- )
+ (
+ permuted_hidden_states,
+ a1q_scale,
+ first_token_off,
+ inv_perm_idx,
+ m_indices,
+ ) = moe_permute(
+ qhidden_states,
+ a1q_scale=None,
+ topk_ids=topk_ids,
+ n_expert=num_experts,
+ expert_map=None,
+ align_block_size=align_block_size,
)
# convert to fp16/bf16 as gemm output
return (
@@ -191,16 +194,19 @@ def benchmark_unpermute(
def run(input: tuple):
if use_customized_permute:
- (permuted_hidden_states, first_token_off, inv_perm_idx, m_indices) = input
+ (
+ permuted_hidden_states,
+ first_token_off,
+ inv_perm_idx,
+ m_indices,
+ ) = input
+ output = torch.empty_like(hidden_states)
moe_unpermute(
+ output,
permuted_hidden_states,
topk_weights,
- topk_ids,
inv_perm_idx,
first_token_off,
- topk,
- num_experts,
- num_experts,
)
else:
(
@@ -211,7 +217,11 @@ def benchmark_unpermute(
inv_perm,
) = input
_moe_unpermute_and_reduce(
- output_hidden_states, permuted_hidden_states, inv_perm, topk_weights
+ output_hidden_states,
+ permuted_hidden_states,
+ inv_perm,
+ topk_weights,
+ True,
)
# JIT compilation & warmup
diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp
index f1738aee980b6..b20a054648428 100644
--- a/csrc/cpu/torch_bindings.cpp
+++ b/csrc/cpu/torch_bindings.cpp
@@ -151,7 +151,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
ops.impl("rotary_embedding", torch::kCPU, &rotary_embedding);
// Quantization
-#if defined(__AVX512F__) || defined(__aarch64__)
+#if defined(__AVX512F__) || (defined(__aarch64__) && !defined(__APPLE__))
at::Tag stride_tag = at::Tag::needs_fixed_stride_order;
// Compute int8 quantized tensor for given scaling factor.
diff --git a/csrc/moe/moe_permute_unpermute_op.cu b/csrc/moe/moe_permute_unpermute_op.cu
index a77471a7f2078..2922352a3f7cc 100644
--- a/csrc/moe/moe_permute_unpermute_op.cu
+++ b/csrc/moe/moe_permute_unpermute_op.cu
@@ -10,32 +10,28 @@
void moe_permute(
const torch::Tensor& input, // [n_token, hidden]
- const torch::Tensor& topk_weights, //[n_token, topk]
- torch::Tensor& topk_ids, // [n_token, topk]
+ const torch::Tensor& topk_ids, // [n_token, topk]
const torch::Tensor& token_expert_indices, // [n_token, topk]
const std::optional& expert_map, // [n_expert]
int64_t n_expert, int64_t n_local_expert, int64_t topk,
const std::optional& align_block_size,
- torch::Tensor&
- permuted_input, // [topk * n_token/align_block_size_m, hidden]
+ torch::Tensor& permuted_input, // [permuted_size, hidden]
torch::Tensor& expert_first_token_offset, // [n_local_expert + 1]
- torch::Tensor& src_row_id2dst_row_id_map, // [n_token, topk]
+ torch::Tensor& inv_permuted_idx, // [n_token, topk]
+ torch::Tensor& permuted_idx, // [permute_size]
torch::Tensor& m_indices) { // [align_expand_m]
- TORCH_CHECK(topk_weights.scalar_type() == at::ScalarType::Float,
- "topk_weights must be float32");
TORCH_CHECK(expert_first_token_offset.scalar_type() == at::ScalarType::Long,
"expert_first_token_offset must be int64");
TORCH_CHECK(topk_ids.scalar_type() == at::ScalarType::Int,
"topk_ids must be int32");
TORCH_CHECK(token_expert_indices.scalar_type() == at::ScalarType::Int,
"token_expert_indices must be int32");
- TORCH_CHECK(src_row_id2dst_row_id_map.scalar_type() == at::ScalarType::Int,
- "src_row_id2dst_row_id_map must be int32");
+ TORCH_CHECK(inv_permuted_idx.scalar_type() == at::ScalarType::Int,
+ "inv_permuted_idx must be int32");
TORCH_CHECK(expert_first_token_offset.size(0) == n_local_expert + 1,
"expert_first_token_offset shape != n_local_expert+1")
- TORCH_CHECK(
- src_row_id2dst_row_id_map.sizes() == token_expert_indices.sizes(),
- "token_expert_indices shape must be same as src_row_id2dst_row_id_map");
+ TORCH_CHECK(inv_permuted_idx.sizes() == token_expert_indices.sizes(),
+ "token_expert_indices shape must be same as inv_permuted_idx");
auto n_token = input.sizes()[0];
auto n_hidden = input.sizes()[1];
auto align_block_size_value =
@@ -46,8 +42,9 @@ void moe_permute(
auto sort_workspace = torch::empty(
{sorter_size},
torch::dtype(torch::kInt8).device(torch::kCUDA).requires_grad(false));
+ auto copy_topk_ids = topk_ids.clone(); // copy topk_ids for preprocess
auto permuted_experts_id = torch::empty_like(topk_ids);
- auto dst_row_id2src_row_id_map = torch::empty_like(src_row_id2dst_row_id_map);
+ auto sorted_row_idx = torch::empty_like(inv_permuted_idx);
auto align_expert_first_token_offset =
torch::zeros_like(expert_first_token_offset);
@@ -67,24 +64,22 @@ void moe_permute(
const int* expert_map_ptr = get_ptr(expert_map.value());
valid_num_ptr =
get_ptr(expert_first_token_offset) + n_local_expert;
- preprocessTopkIdLauncher(get_ptr(topk_ids), n_token * topk,
+ preprocessTopkIdLauncher(get_ptr(copy_topk_ids), n_token * topk,
expert_map_ptr, n_expert, stream);
}
// expert sort topk expert id and scan expert id get expert_first_token_offset
- sortAndScanExpert(get_ptr(topk_ids), get_ptr(token_expert_indices),
- get_ptr(permuted_experts_id),
- get_ptr(dst_row_id2src_row_id_map),
- get_ptr(expert_first_token_offset), n_token,
- n_expert, n_local_expert, topk, sorter,
- get_ptr(sort_workspace), stream);
+ sortAndScanExpert(
+ get_ptr(copy_topk_ids), get_ptr(token_expert_indices),
+ get_ptr(permuted_experts_id), get_ptr(sorted_row_idx),
+ get_ptr(expert_first_token_offset), n_token, n_expert,
+ n_local_expert, topk, sorter, get_ptr(sort_workspace), stream);
// dispatch expandInputRowsKernelLauncher
MOE_DISPATCH(input.scalar_type(), [&] {
expandInputRowsKernelLauncher(
get_ptr(input), get_ptr(permuted_input),
- get_ptr(topk_weights), get_ptr(permuted_experts_id),
- get_ptr(dst_row_id2src_row_id_map),
- get_ptr(src_row_id2dst_row_id_map),
+ get_ptr(permuted_experts_id), get_ptr(sorted_row_idx),
+ get_ptr(inv_permuted_idx), get_ptr(permuted_idx),
get_ptr(expert_first_token_offset), n_token, valid_num_ptr,
n_hidden, topk, n_local_expert, align_block_size_value, stream);
});
@@ -101,32 +96,34 @@ void moe_permute(
}
void moe_unpermute(
- const torch::Tensor& permuted_hidden_states, // [n_token * topk, hidden]
- const torch::Tensor& topk_weights, //[n_token, topk]
- const torch::Tensor& topk_ids, // [n_token, topk]
- const torch::Tensor& src_row_id2dst_row_id_map, // [n_token, topk]
- const torch::Tensor& expert_first_token_offset, // [n_local_expert+1]
- int64_t n_expert, int64_t n_local_expert, int64_t topk,
+ const torch::Tensor& permuted_hidden_states, // [n_token * topk, hidden]
+ const torch::Tensor& topk_weights, // [n_token, topk]
+ const torch::Tensor& inv_permuted_idx, // [n_token, topk]
+ const std::optional&
+ expert_first_token_offset, // [n_local_expert+1]
+ int64_t topk,
torch::Tensor& hidden_states // [n_token, hidden]
) {
- TORCH_CHECK(src_row_id2dst_row_id_map.sizes() == topk_ids.sizes(),
- "topk_ids shape must be same as src_row_id2dst_row_id_map");
- TORCH_CHECK(topk_ids.scalar_type() == at::ScalarType::Int,
- "topk_ids must be int32");
TORCH_CHECK(
permuted_hidden_states.scalar_type() == hidden_states.scalar_type(),
- "topk_ids dtype must be same as src_row_id2dst_row_id_map");
+ "permuted_hidden_states dtype must be same as hidden_states");
auto n_token = hidden_states.size(0);
auto n_hidden = hidden_states.size(1);
auto stream = at::cuda::getCurrentCUDAStream().stream();
- const int64_t* valid_ptr =
- get_ptr(expert_first_token_offset) + n_local_expert;
+
+ int64_t const* valid_ptr = nullptr;
+ if (expert_first_token_offset.has_value()) {
+ int n_local_expert = expert_first_token_offset.value().size(0) - 1;
+ valid_ptr =
+ get_ptr(expert_first_token_offset.value()) + n_local_expert;
+ }
+
MOE_DISPATCH(hidden_states.scalar_type(), [&] {
finalizeMoeRoutingKernelLauncher(
get_ptr(permuted_hidden_states),
get_ptr(hidden_states), get_ptr(topk_weights),
- get_ptr(src_row_id2dst_row_id_map), get_ptr(topk_ids),
- n_token, n_hidden, topk, valid_ptr, stream);
+ get_ptr(inv_permuted_idx), n_token, n_hidden, topk, valid_ptr,
+ stream);
});
}
diff --git a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu
index de2c153882d93..2271c1bc75b1f 100644
--- a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu
+++ b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu
@@ -177,7 +177,7 @@ __global__ void getMIndicesKernel(int64_t* expert_first_token_offset,
int tidx = threadIdx.x;
extern __shared__ int64_t smem_expert_first_token_offset[];
for (int i = tidx; i <= num_local_expert; i += blockDim.x) {
- smem_expert_first_token_offset[tidx] = __ldg(expert_first_token_offset + i);
+ smem_expert_first_token_offset[i] = __ldg(expert_first_token_offset + i);
}
__syncthreads();
auto last_token_offset = smem_expert_first_token_offset[eidx + 1];
diff --git a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.h b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.h
index 43c29721cd16e..108091efbefa8 100644
--- a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.h
+++ b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.h
@@ -57,31 +57,19 @@ void sortAndScanExpert(int* expert_for_source_row, const int* source_rows,
template
void expandInputRowsKernelLauncher(
- T const* unpermuted_input, T* permuted_output,
- const float* unpermuted_scales, int* sorted_experts,
+ T const* unpermuted_input, T* permuted_output, int* sorted_experts,
int const* expanded_dest_row_to_expanded_source_row,
- int* expanded_source_row_to_expanded_dest_row,
+ int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
int64_t* expert_first_token_offset, int64_t const num_rows,
int64_t const* num_valid_tokens_ptr, int64_t const cols, int const k,
int num_local_experts, const int& align_block_size, cudaStream_t stream);
-// Final kernel to unpermute and scale
-// This kernel unpermutes the original data, does the k-way reduction and
-// performs the final skip connection.
-template
-__global__ void finalizeMoeRoutingKernel(
- T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output,
- float const* scales, int const* expanded_source_row_to_expanded_dest_row,
- int const* expert_for_source_row, int64_t const orig_cols, int64_t const k,
- int64_t const* num_valid_ptr);
-
template
void finalizeMoeRoutingKernelLauncher(
T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output,
float const* scales, int const* expanded_source_row_to_expanded_dest_row,
- int const* expert_for_source_row, int64_t const num_rows,
- int64_t const cols, int64_t const k, int64_t const* num_valid_ptr,
- cudaStream_t stream);
+ int64_t const num_rows, int64_t const cols, int64_t const k,
+ int64_t const* num_valid_ptr, cudaStream_t stream);
void preprocessTopkIdLauncher(int* topk_id_ptr, int size,
const int* expert_map_ptr, int num_experts,
diff --git a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.inl b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.inl
index ad0d390665a00..449243b92a283 100644
--- a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.inl
+++ b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.inl
@@ -2,10 +2,9 @@
template
__global__ void expandInputRowsKernel(
- T const* unpermuted_input, T* permuted_output,
- const float* unpermuted_scales, int* sorted_experts,
+ T const* unpermuted_input, T* permuted_output, int* sorted_experts,
int const* expanded_dest_row_to_expanded_source_row,
- int* expanded_source_row_to_expanded_dest_row,
+ int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
int64_t* expert_first_token_offset, int64_t const num_rows,
int64_t const* num_dest_rows, int64_t const cols, int64_t k,
int num_local_experts, int align_block_size) {
@@ -54,6 +53,10 @@ __global__ void expandInputRowsKernel(
assert(expanded_dest_row <= INT32_MAX);
expanded_source_row_to_expanded_dest_row[expanded_source_row] =
static_cast(expanded_dest_row);
+ // skip non local expert token
+ if (!CHECK_SKIPPED || blockIdx.x < *num_dest_rows) {
+ permuted_idx[expanded_dest_row] = expanded_source_row;
+ }
}
if (!CHECK_SKIPPED || blockIdx.x < *num_dest_rows) {
@@ -62,7 +65,7 @@ __global__ void expandInputRowsKernel(
using DataElem = cutlass::Array;
// Duplicate and permute rows
- int64_t const source_row = expanded_source_row % num_rows;
+ int64_t const source_row = expanded_source_row / k;
auto const* source_row_ptr =
reinterpret_cast(unpermuted_input + source_row * cols);
@@ -82,10 +85,9 @@ __global__ void expandInputRowsKernel(
template
void expandInputRowsKernelLauncher(
- T const* unpermuted_input, T* permuted_output,
- const float* unpermuted_scales, int* sorted_experts,
+ T const* unpermuted_input, T* permuted_output, int* sorted_experts,
int const* expanded_dest_row_to_expanded_source_row,
- int* expanded_source_row_to_expanded_dest_row,
+ int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
int64_t* expert_first_token_offset, int64_t const num_rows,
int64_t const* num_valid_tokens_ptr, int64_t const cols, int const k,
int num_local_experts, const int& align_block_size, cudaStream_t stream) {
@@ -105,11 +107,11 @@ void expandInputRowsKernelLauncher(
int64_t smem_size = sizeof(int64_t) * (num_local_experts + 1);
func<<>>(
- unpermuted_input, permuted_output, unpermuted_scales, sorted_experts,
+ unpermuted_input, permuted_output, sorted_experts,
expanded_dest_row_to_expanded_source_row,
- expanded_source_row_to_expanded_dest_row, expert_first_token_offset,
- num_rows, num_valid_tokens_ptr, cols, k, num_local_experts,
- align_block_size);
+ expanded_source_row_to_expanded_dest_row, permuted_idx,
+ expert_first_token_offset, num_rows, num_valid_tokens_ptr, cols, k,
+ num_local_experts, align_block_size);
}
template
@@ -128,11 +130,9 @@ template
__global__ void finalizeMoeRoutingKernel(
T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output,
float const* scales, int const* expanded_source_row_to_expanded_dest_row,
- int const* expert_for_source_row, int64_t const orig_cols, int64_t const k,
- int64_t const* num_valid_ptr) {
+ int64_t const orig_cols, int64_t const k, int64_t const* num_valid_ptr) {
assert(orig_cols % 4 == 0);
int64_t const original_row = blockIdx.x;
- int64_t const num_rows = gridDim.x;
auto const offset = original_row * orig_cols;
OutputType* reduced_row_ptr = reduced_unpermuted_output + offset;
int64_t const num_valid = *num_valid_ptr;
@@ -159,14 +159,13 @@ __global__ void finalizeMoeRoutingKernel(
ComputeElem thread_output;
thread_output.fill(0);
for (int k_idx = 0; k_idx < k; ++k_idx) {
- int64_t const expanded_original_row = original_row + k_idx * num_rows;
+ int64_t const expanded_original_row = original_row * k + k_idx;
int64_t const expanded_permuted_row =
expanded_source_row_to_expanded_dest_row[expanded_original_row];
int64_t const k_offset = original_row * k + k_idx;
float const row_scale = scales[k_offset];
- // Check after row_rescale has accumulated
if (CHECK_SKIPPED && expanded_permuted_row >= num_valid) {
continue;
}
@@ -189,9 +188,8 @@ template
void finalizeMoeRoutingKernelLauncher(
T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output,
float const* scales, int const* expanded_source_row_to_expanded_dest_row,
- int const* expert_for_source_row, int64_t const num_rows,
- int64_t const cols, int64_t const k, int64_t const* num_valid_ptr,
- cudaStream_t stream) {
+ int64_t const num_rows, int64_t const cols, int64_t const k,
+ int64_t const* num_valid_ptr, cudaStream_t stream) {
int64_t const blocks = num_rows;
int64_t const threads = 256;
bool const check_finished = num_valid_ptr != nullptr;
@@ -201,6 +199,5 @@ void finalizeMoeRoutingKernelLauncher(
auto* const kernel = func_map[check_finished];
kernel<<>>(
expanded_permuted_rows, reduced_unpermuted_output, scales,
- expanded_source_row_to_expanded_dest_row, expert_for_source_row, cols, k,
- num_valid_ptr);
+ expanded_source_row_to_expanded_dest_row, cols, k, num_valid_ptr);
}
diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp
index 97df311d04409..d96e082f6ef11 100644
--- a/csrc/moe/torch_bindings.cpp
+++ b/csrc/moe/torch_bindings.cpp
@@ -56,18 +56,17 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
" -> Tensor");
m.def(
- "moe_permute(Tensor input, Tensor topk_weight, Tensor! topk_ids,"
+ "moe_permute(Tensor input, Tensor topk_ids,"
"Tensor token_expert_indices, Tensor? expert_map, int n_expert,"
"int n_local_expert,"
"int topk, int? align_block_size,Tensor! permuted_input, Tensor! "
- "expert_first_token_offset, Tensor! src_row_id2dst_row_id_map, Tensor! "
- "m_indices)->()");
+ "expert_first_token_offset, Tensor! inv_permuted_idx, Tensor! "
+ "permuted_idx, Tensor! m_indices)->()");
m.def(
"moe_unpermute(Tensor permuted_hidden_states, Tensor topk_weights,"
- "Tensor topk_ids,Tensor src_row_id2dst_row_id_map, Tensor "
- "expert_first_token_offset, int n_expert, int n_local_expert,int "
- "topk, Tensor! hidden_states)->()");
+ "Tensor inv_permuted_idx, Tensor? expert_first_token_offset, "
+ "int topk, Tensor! hidden_states)->()");
m.def("moe_permute_unpermute_supported() -> bool");
m.impl("moe_permute_unpermute_supported", &moe_permute_unpermute_supported);
diff --git a/csrc/ops.h b/csrc/ops.h
index 97a247d9d628c..207291eceb169 100644
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -292,6 +292,11 @@ void per_token_group_quant_fp8(const torch::Tensor& input,
torch::Tensor& output_q, torch::Tensor& output_s,
int64_t group_size, double eps, double fp8_min,
double fp8_max, bool scale_ue8m0);
+
+void per_token_group_quant_int8(const torch::Tensor& input,
+ torch::Tensor& output_q,
+ torch::Tensor& output_s, int64_t group_size,
+ double eps, double int8_min, double int8_max);
#endif
void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
diff --git a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu
index 5cd2ac179768b..6a81f159f46ae 100644
--- a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu
+++ b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu
@@ -1,6 +1,8 @@
#include
#include
+#include "../per_token_group_quant_8bit.h"
+
#include
#include "../../dispatch_utils.h"
@@ -336,3 +338,11 @@ void dynamic_scaled_int8_quant(
}
});
}
+
+void per_token_group_quant_int8(const torch::Tensor& input,
+ torch::Tensor& output_q,
+ torch::Tensor& output_s, int64_t group_size,
+ double eps, double int8_min, double int8_max) {
+ per_token_group_quant_8bit(input, output_q, output_s, group_size, eps,
+ int8_min, int8_max);
+}
\ No newline at end of file
diff --git a/csrc/quantization/fp8/per_token_group_quant.cu b/csrc/quantization/fp8/per_token_group_quant.cu
index afc41faeca902..2609054f2072b 100644
--- a/csrc/quantization/fp8/per_token_group_quant.cu
+++ b/csrc/quantization/fp8/per_token_group_quant.cu
@@ -1,6 +1,8 @@
#include
#include
+#include "../per_token_group_quant_8bit.h"
+
#include
#include
@@ -120,7 +122,7 @@ void per_token_group_quant_8bit(const torch::Tensor& input,
torch::Tensor& output_q,
torch::Tensor& output_s, int64_t group_size,
double eps, double min_8bit, double max_8bit,
- bool scale_ue8m0 = false) {
+ bool scale_ue8m0) {
TORCH_CHECK(input.is_contiguous());
TORCH_CHECK(output_q.is_contiguous());
@@ -198,6 +200,8 @@ void per_token_group_quant_8bit(const torch::Tensor& input,
input.scalar_type(), "per_token_group_quant_8bit", ([&] {
if (dst_type == at::ScalarType::Float8_e4m3fn) {
LAUNCH_KERNEL(scalar_t, c10::Float8_e4m3fn);
+ } else if (dst_type == at::ScalarType::Char) {
+ LAUNCH_KERNEL(scalar_t, int8_t);
}
}));
diff --git a/csrc/quantization/machete/machete_prepacked_layout.cuh b/csrc/quantization/machete/machete_prepacked_layout.cuh
index 81aaa6c4f3a28..4a7d6341e6c00 100644
--- a/csrc/quantization/machete/machete_prepacked_layout.cuh
+++ b/csrc/quantization/machete/machete_prepacked_layout.cuh
@@ -187,8 +187,12 @@ struct PrepackedLayoutBTemplate {
CUTE_HOST_DEVICE static constexpr auto TVbNbKL_to_offset_copy(
Shape_NKL shape_mkl) {
auto layout = TVbNbKL_to_offset(shape_mkl);
- return make_layout(coalesce(get<0>(layout)), get<1>(layout),
- get<2>(layout));
+ // for 4-bit elements, having >= 64 values per column
+ // allows TMA to load full 32-byte sectors
+ auto inner_layout =
+ make_layout(make_shape(_256{}, size<0>(layout) / _256{}));
+
+ return make_layout(inner_layout, get<1>(layout), get<2>(layout));
}
// ((BlockN, BlockK), (BlocksN, BlocksK), L) -> (storage_idx)
diff --git a/csrc/quantization/per_token_group_quant_8bit.h b/csrc/quantization/per_token_group_quant_8bit.h
new file mode 100644
index 0000000000000..537b61bc4303f
--- /dev/null
+++ b/csrc/quantization/per_token_group_quant_8bit.h
@@ -0,0 +1,10 @@
+#pragma once
+#include
+
+// TODO(wentao): refactor the folder to 8bit, then include fp8 and int8 folders
+// 8-bit per-token-group quantization helper used by both FP8 and INT8
+void per_token_group_quant_8bit(const torch::Tensor& input,
+ torch::Tensor& output_q,
+ torch::Tensor& output_s, int64_t group_size,
+ double eps, double min_8bit, double max_8bit,
+ bool scale_ue8m0 = false);
\ No newline at end of file
diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp
index 95f8541bc9e2d..85b6abef00b03 100644
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@@ -624,6 +624,14 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
ops.impl("per_token_group_fp8_quant", torch::kCUDA,
&per_token_group_quant_fp8);
+ // Compute per-token-group INT8 quantized tensor and scaling factor.
+ ops.def(
+ "per_token_group_quant_int8(Tensor input, Tensor! output_q, Tensor! "
+ "output_s, int group_size, float eps, float int8_min, float int8_max) -> "
+ "()");
+ ops.impl("per_token_group_quant_int8", torch::kCUDA,
+ &per_token_group_quant_int8);
+
// reorder weight for AllSpark Ampere W8A16 Fused Gemm kernel
ops.def(
"rearrange_kn_weight_as_n32k16_order(Tensor b_qweight, Tensor b_scales, "
diff --git a/docker/Dockerfile.arm b/docker/Dockerfile.arm
deleted file mode 100644
index bad093684239c..0000000000000
--- a/docker/Dockerfile.arm
+++ /dev/null
@@ -1,62 +0,0 @@
-# This vLLM Dockerfile is used to construct an image that can build and run vLLM on ARM CPU platform.
-
-FROM ubuntu:22.04 AS cpu-test-arm
-
-ENV CCACHE_DIR=/root/.cache/ccache
-
-ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
-
-RUN --mount=type=cache,target=/var/cache/apt \
- apt-get update -y \
- && apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
- && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
- && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
-
-# tcmalloc provides better memory allocation efficiency, e.g., holding memory in caches to speed up access of commonly-used objects.
-RUN --mount=type=cache,target=/root/.cache/pip \
- pip install py-cpuinfo # Use this to gather CPU info and optimize based on ARM Neoverse cores
-
-# Set LD_PRELOAD for tcmalloc on ARM
-ENV LD_PRELOAD="/usr/lib/aarch64-linux-gnu/libtcmalloc_minimal.so.4"
-
-RUN echo 'ulimit -c 0' >> ~/.bashrc
-
-WORKDIR /workspace
-
-ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
-ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
-RUN --mount=type=cache,target=/root/.cache/pip \
- --mount=type=bind,src=requirements/build.txt,target=requirements/build.txt \
- pip install --upgrade pip && \
- pip install -r requirements/build.txt
-
-FROM cpu-test-arm AS build
-
-WORKDIR /workspace/vllm
-
-RUN --mount=type=cache,target=/root/.cache/pip \
- --mount=type=bind,src=requirements/common.txt,target=requirements/common.txt \
- --mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \
- pip install -v -r requirements/cpu.txt
-
-COPY . .
-ARG GIT_REPO_CHECK=0
-RUN --mount=type=bind,source=.git,target=.git \
- if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
-
-# Disabling AVX512 specific optimizations for ARM
-ARG VLLM_CPU_DISABLE_AVX512="true"
-ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
-
-RUN --mount=type=cache,target=/root/.cache/pip \
- --mount=type=cache,target=/root/.cache/ccache \
- --mount=type=bind,source=.git,target=.git \
- VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
- pip install dist/*.whl && \
- rm -rf dist
-
-WORKDIR /workspace/
-
-RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
-
-ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
\ No newline at end of file
diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu
index 982c1ddf27438..5e49e87131ece 100644
--- a/docker/Dockerfile.cpu
+++ b/docker/Dockerfile.cpu
@@ -1,4 +1,11 @@
-# This vLLM Dockerfile is used to construct image that can build and run vLLM on x86 CPU platform.
+# This vLLM Dockerfile is used to build images that can run vLLM on both x86_64 and arm64 CPU platforms.
+#
+# Supported platforms:
+# - linux/amd64 (x86_64)
+# - linux/arm64 (aarch64)
+#
+# Use the `--platform` option with `docker buildx build` to specify the target architecture, e.g.:
+# docker buildx build --platform=linux/arm64 -f docker/Dockerfile.cpu .
#
# Build targets:
# vllm-openai (default): used for serving deployment
@@ -53,7 +60,20 @@ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --upgrade pip && \
uv pip install -r requirements/cpu.txt
-ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/opt/venv/lib/libiomp5.so:$LD_PRELOAD"
+ARG TARGETARCH
+ENV TARGETARCH=${TARGETARCH}
+
+RUN if [ "$TARGETARCH" = "arm64" ]; then \
+ PRELOAD_PATH="/usr/lib/aarch64-linux-gnu/libtcmalloc_minimal.so.4"; \
+ else \
+ PRELOAD_PATH="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/opt/venv/lib/libiomp5.so"; \
+ fi && \
+ echo "export LD_PRELOAD=$PRELOAD_PATH" >> ~/.bashrc
+
+# Ensure that the LD_PRELOAD environment variable for export is in effect.
+SHELL ["/bin/bash", "-c"]
+
+ENV LD_PRELOAD=${LD_PRELOAD}
RUN echo 'ulimit -c 0' >> ~/.bashrc
diff --git a/docker/Dockerfile.tpu b/docker/Dockerfile.tpu
index 3474ff50de7bd..b9fc9def88190 100644
--- a/docker/Dockerfile.tpu
+++ b/docker/Dockerfile.tpu
@@ -1,4 +1,4 @@
-ARG NIGHTLY_DATE="20250714"
+ARG NIGHTLY_DATE="20250724"
ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.12_tpuvm_$NIGHTLY_DATE"
FROM $BASE_IMAGE
diff --git a/docs/contributing/profiling.md b/docs/contributing/profiling.md
index a5851cfe963d2..13c3bc2c7e031 100644
--- a/docs/contributing/profiling.md
+++ b/docs/contributing/profiling.md
@@ -9,10 +9,13 @@ We support tracing vLLM workers using the `torch.profiler` module. You can enabl
The OpenAI server also needs to be started with the `VLLM_TORCH_PROFILER_DIR` environment variable set.
-When using `benchmarks/benchmark_serving.py`, you can enable profiling by passing the `--profile` flag.
+When using `vllm bench serve`, you can enable profiling by passing the `--profile` flag.
Traces can be visualized using .
+!!! tip
+    You can directly call the bench module without installing vllm using `python -m vllm.entrypoints.cli.main bench`.
+
!!! tip
Only send a few requests through vLLM when profiling, as the traces can get quite large. Also, no need to untar the traces, they can be viewed directly.
@@ -35,10 +38,10 @@ VLLM_TORCH_PROFILER_DIR=./vllm_profile \
--model meta-llama/Meta-Llama-3-70B
```
-benchmark_serving.py:
+vllm bench command:
```bash
-python benchmarks/benchmark_serving.py \
+vllm bench serve \
--backend vllm \
--model meta-llama/Meta-Llama-3-70B \
--dataset-name sharegpt \
@@ -69,13 +72,13 @@ apt install nsight-systems-cli
For basic usage, you can just append `nsys profile -o report.nsys-rep --trace-fork-before-exec=true --cuda-graph-trace=node` before any existing script you would run for offline inference.
-The following is an example using the `benchmarks/benchmark_latency.py` script:
+The following is an example using the `vllm bench latency` script:
```bash
nsys profile -o report.nsys-rep \
--trace-fork-before-exec=true \
--cuda-graph-trace=node \
- python benchmarks/benchmark_latency.py \
+  vllm bench latency \
--model meta-llama/Llama-3.1-8B-Instruct \
--num-iters-warmup 5 \
--num-iters 1 \
@@ -98,7 +101,7 @@ nsys profile -o report.nsys-rep \
vllm serve meta-llama/Llama-3.1-8B-Instruct
# client
-python benchmarks/benchmark_serving.py \
+vllm bench serve \
--backend vllm \
--model meta-llama/Llama-3.1-8B-Instruct \
--num-prompts 1 \
@@ -132,7 +135,7 @@ You can view these profiles either as summaries in the CLI, using `nsys stats [p
...
** CUDA GPU Kernel Summary (cuda_gpu_kern_sum):
- Time (%) Total Time (ns) Instances Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name
+ Time (%) Total Time (ns) Instances Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name
-------- --------------- --------- ----------- ----------- -------- --------- ----------- ----------------------------------------------------------------------------------------------------
46.3 10,327,352,338 17,505 589,965.9 144,383.0 27,040 3,126,460 944,263.8 sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize128x128x64_warpgroupsize1x1x1_execute_segment_k_of…
14.8 3,305,114,764 5,152 641,520.7 293,408.0 287,296 2,822,716 867,124.9 sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize256x128x64_warpgroupsize2x1x1_execute_segment_k_of…
@@ -143,7 +146,7 @@ You can view these profiles either as summaries in the CLI, using `nsys stats [p
2.6 587,283,113 37,824 15,526.7 3,008.0 2,719 2,517,756 139,091.1 std::enable_if(int)0&&vllm::_typeConvert::exists, void>::type vllm::fused_add_rms_norm_kern…
1.9 418,362,605 18,912 22,121.5 3,871.0 3,328 2,523,870 175,248.2 void vllm::rotary_embedding_kernel(const long *, T1 *, T1 *, const T1 *, in…
0.7 167,083,069 18,880 8,849.7 2,240.0 1,471 2,499,996 101,436.1 void vllm::reshape_and_cache_flash_kernel<__nv_bfloat16, __nv_bfloat16, (vllm::Fp8KVCacheDataType)0…
- ...
+ ...
```
GUI example:
diff --git a/docs/design/v1/p2p_nccl_connector.md b/docs/design/v1/p2p_nccl_connector.md
index 9f6acf3291dd2..9d334f8873d97 100644
--- a/docs/design/v1/p2p_nccl_connector.md
+++ b/docs/design/v1/p2p_nccl_connector.md
@@ -3,14 +3,14 @@ An implementation of xPyD with dynamic scaling based on point-to-point communica
# Detailed Design
## Overall Process
-As shown in Figure 1, the overall process of this **PD disaggregation** solution is described through a request flow:
+As shown in Figure 1, the overall process of this **PD disaggregation** solution is described through a request flow:
-1. The client sends an HTTP request to the Proxy/Router's `/v1/completions` interface.
-2. The Proxy/Router selects a **1P1D (1 Prefill instance + 1 Decode instance)** through either through round-robin or random selection, generates a `request_id` (rules to be introduced later), modifies the `max_tokens` in the HTTP request message to **1**, and then forwards the request to the **P instance**.
-3. Immediately afterward, the Proxy/Router forwards the **original HTTP request** to the **D instance**.
-4. The **P instance** performs **Prefill** and then **actively sends the generated KV cache** to the D instance (using **PUT_ASYNC** mode). The D instance's `zmq_addr` can be resolved through the `request_id`.
-5. The **D instance** has a **dedicated thread** for receiving the KV cache (to avoid blocking the main process). The received KV cache is saved into the **GPU memory buffer**, the size of which is determined by the vLLM startup parameter `kv_buffer_size`. When the GPU buffer is full, the KV cache is stored in the **local Tensor memory pool**.
-6. During the **Decode**, the D instance's main process retrieves the KV cache (transmitted by the P instance) from either the **GPU buffer** or the **memory pool**, thereby **skipping Prefill**.
+1. The client sends an HTTP request to the Proxy/Router's `/v1/completions` interface.
+2. The Proxy/Router selects a **1P1D (1 Prefill instance + 1 Decode instance)** through either round-robin or random selection, generates a `request_id` (rules to be introduced later), modifies the `max_tokens` in the HTTP request message to **1**, and then forwards the request to the **P instance**.
+3. Immediately afterward, the Proxy/Router forwards the **original HTTP request** to the **D instance**.
+4. The **P instance** performs **Prefill** and then **actively sends the generated KV cache** to the D instance (using **PUT_ASYNC** mode). The D instance's `zmq_addr` can be resolved through the `request_id`.
+5. The **D instance** has a **dedicated thread** for receiving the KV cache (to avoid blocking the main process). The received KV cache is saved into the **GPU memory buffer**, the size of which is determined by the vLLM startup parameter `kv_buffer_size`. When the GPU buffer is full, the KV cache is stored in the **local Tensor memory pool**.
+6. During the **Decode**, the D instance's main process retrieves the KV cache (transmitted by the P instance) from either the **GPU buffer** or the **memory pool**, thereby **skipping Prefill**.
7. After completing **Decode**, the D instance returns the result to the **Proxy/Router**, which then forwards it to the **client**.

@@ -291,7 +291,7 @@ curl -X POST -s http://10.0.1.1:10001/v1/completions \
??? console "Command"
```shell
- python3 benchmark_serving.py \
+ vllm bench serve \
--backend vllm \
--model base_model \
--tokenizer meta-llama/Llama-3.1-8B-Instruct \
diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md
index e820ace4f8fe7..e83dfdb11dadc 100644
--- a/docs/features/multimodal_inputs.md
+++ b/docs/features/multimodal_inputs.md
@@ -177,6 +177,70 @@ Multi-image input can be extended to perform video captioning. We show this with
You can pass a list of NumPy arrays directly to the `'video'` field of the multi-modal dictionary
instead of using multi-image input.
+Instead of NumPy arrays, you can also pass `torch.Tensor` instances, as shown in this example using Qwen2.5-VL:
+
+??? code
+
+ ```python
+ from transformers import AutoProcessor
+ from vllm import LLM, SamplingParams
+ from qwen_vl_utils import process_vision_info
+
+ model_path = "Qwen/Qwen2.5-VL-3B-Instruct/"
+ video_path = "https://content.pexels.com/videos/free-videos.mp4"
+
+ llm = LLM(
+ model=model_path,
+ gpu_memory_utilization=0.8,
+ enforce_eager=True,
+ limit_mm_per_prompt={"video": 1},
+ )
+
+ sampling_params = SamplingParams(
+ max_tokens=1024,
+ )
+
+ video_messages = [
+ {"role": "system", "content": "You are a helpful assistant."},
+ {"role": "user", "content": [
+ {"type": "text", "text": "describe this video."},
+ {
+ "type": "video",
+ "video": video_path,
+ "total_pixels": 20480 * 28 * 28,
+ "min_pixels": 16 * 28 * 28
+ }
+ ]
+ },
+ ]
+
+ messages = video_messages
+ processor = AutoProcessor.from_pretrained(model_path)
+ prompt = processor.apply_chat_template(
+ messages,
+ tokenize=False,
+ add_generation_prompt=True,
+ )
+
+ image_inputs, video_inputs = process_vision_info(messages)
+ mm_data = {}
+ if video_inputs is not None:
+ mm_data["video"] = video_inputs
+
+ llm_inputs = {
+ "prompt": prompt,
+ "multi_modal_data": mm_data,
+ }
+
+ outputs = llm.generate([llm_inputs], sampling_params=sampling_params)
+ for o in outputs:
+ generated_text = o.outputs[0].text
+ print(generated_text)
+ ```
+
+ !!! note
+ 'process_vision_info' is only applicable to Qwen2.5-VL and similar models.
+
Full example:
### Audio Inputs
diff --git a/docs/features/quantization/README.md b/docs/features/quantization/README.md
index e8c3b11230786..e18c128f30fc9 100644
--- a/docs/features/quantization/README.md
+++ b/docs/features/quantization/README.md
@@ -6,6 +6,7 @@ Contents:
- [Supported Hardware](supported_hardware.md)
- [AutoAWQ](auto_awq.md)
+- [AutoRound](auto_round.md)
- [BitsAndBytes](bnb.md)
- [BitBLAS](bitblas.md)
- [GGUF](gguf.md)
diff --git a/docs/features/quantization/auto_round.md b/docs/features/quantization/auto_round.md
new file mode 100644
index 0000000000000..2dfd847bb7d9a
--- /dev/null
+++ b/docs/features/quantization/auto_round.md
@@ -0,0 +1,103 @@
+# AutoRound
+
+[AutoRound](https://github.com/intel/auto-round) is Intel’s advanced quantization algorithm designed to produce highly efficient **INT2, INT3, INT4, and INT8**
+quantized large language models—striking an optimal balance between accuracy and deployment performance.
+
+AutoRound applies weight-only quantization to transformer-based models, enabling significant memory savings and faster
+inference while maintaining near-original accuracy. It supports a wide range of hardware platforms, including **CPUs,
+Intel GPUs, HPUs, and CUDA-enabled devices**.
+
+Please refer to the [AutoRound guide](https://github.com/intel/auto-round/blob/main/docs/step_by_step.md) for more details.
+
+Key Features:
+
+✅ **AutoRound, AutoAWQ, AutoGPTQ, and GGUF** are supported
+
+✅ **10+ vision-language models (VLMs)** are supported
+
+✅ **Per-layer mixed-bit quantization** for fine-grained control
+
+✅ **RTN (Round-To-Nearest) mode** for quick quantization with slight accuracy loss
+
+✅ **Multiple quantization recipes**: best, base, and light
+
+✅ Advanced utilities such as immediate packing and support for **10+ backends**
+
+## Installation
+
+```bash
+uv pip install auto-round
+```
+
+## Quantizing a model
+
+For VLMs, please change to `auto-round-mllm` in CLI usage and `AutoRoundMLLM` in API usage.
+
+### CLI usage
+
+```bash
+auto-round \
+ --model Qwen/Qwen3-0.6B \
+ --bits 4 \
+ --group_size 128 \
+ --format "auto_round" \
+ --output_dir ./tmp_autoround
+```
+
+```bash
+auto-round \
+ --model Qwen/Qwen3-0.6B \
+ --format "gguf:q4_k_m" \
+ --output_dir ./tmp_autoround
+```
+
+### API usage
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from auto_round import AutoRound
+
+model_name = "Qwen/Qwen3-0.6B"
+model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+bits, group_size, sym = 4, 128, True
+autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym)
+
+# the best accuracy, 4-5X slower, low_gpu_mem_usage could save ~20G but ~30% slower
+# autoround = AutoRound(model, tokenizer, nsamples=512, iters=1000, low_gpu_mem_usage=True, bits=bits, group_size=group_size, sym=sym)
+
+# 2-3X speedup, slight accuracy drop at W4G128
+# autoround = AutoRound(model, tokenizer, nsamples=128, iters=50, lr=5e-3, bits=bits, group_size=group_size, sym=sym )
+
+output_dir = "./tmp_autoround"
+# format= 'auto_round'(default), 'auto_gptq', 'auto_awq'
+autoround.quantize_and_save(output_dir, format="auto_round")
+```
+
+## Running a quantized model with vLLM
+
+Here is some example code to run auto-round format in vLLM:
+
+```python
+from vllm import LLM, SamplingParams
+
+prompts = [
+ "Hello, my name is",
+]
+sampling_params = SamplingParams(temperature=0.6, top_p=0.95)
+model_name = "Intel/DeepSeek-R1-0528-Qwen3-8B-int4-AutoRound"
+llm = LLM(model=model_name)
+
+outputs = llm.generate(prompts, sampling_params)
+
+for output in outputs:
+ prompt = output.prompt
+ generated_text = output.outputs[0].text
+ print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+```
+
+## Acknowledgement
+
+Special thanks to open-source low precision libraries such as AutoGPTQ, AutoAWQ, GPTQModel, Triton, Marlin, and
+ExLLaMAV2 for providing low-precision CUDA kernels, which are leveraged in AutoRound.
diff --git a/docs/getting_started/installation/cpu/arm.inc.md b/docs/getting_started/installation/cpu/arm.inc.md
index 63ae351b395fb..cac578eefb1d7 100644
--- a/docs/getting_started/installation/cpu/arm.inc.md
+++ b/docs/getting_started/installation/cpu/arm.inc.md
@@ -33,7 +33,7 @@ Testing has been conducted on AWS Graviton3 instances for compatibility.
# --8<-- [end:pre-built-images]
# --8<-- [start:build-image-from-source]
```bash
-docker build -f docker/Dockerfile.arm \
+docker build -f docker/Dockerfile.cpu \
--tag vllm-cpu-env .
# Launching OpenAI server
diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index 0143d137ff3f9..355ac57094195 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -365,6 +365,7 @@ th {
| `Grok1ModelForCausalLM` | Grok1 | `hpcai-tech/grok-1`. | ✅︎ | ✅︎ | ✅︎ |
| `HunYuanDenseV1ForCausalLM` | Hunyuan-7B-Instruct-0124 | `tencent/Hunyuan-7B-Instruct-0124` | ✅︎ | | ✅︎ |
| `HunYuanMoEV1ForCausalLM` | Hunyuan-80B-A13B | `tencent/Hunyuan-A13B-Instruct`, `tencent/Hunyuan-A13B-Pretrain`, `tencent/Hunyuan-A13B-Instruct-FP8`, etc. | ✅︎ | | ✅︎ |
+| `HCXVisionForCausalLM` | HyperCLOVAX-SEED-Vision-Instruct-3B | `naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B` | | | ✅︎ |
| `InternLMForCausalLM` | InternLM | `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `InternLM2ForCausalLM` | InternLM2 | `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `InternLM3ForCausalLM` | InternLM3 | `internlm/internlm3-8b-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
@@ -592,6 +593,7 @@ Specified using `--task generate`.
| `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | ✅︎ |
| `H2OVLChatModel` | H2OVL | T + IE+ | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎ |
| `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | | ✅︎ |
+| `InternS1ForConditionalGeneration` | Intern-S1 | T + IE+ + VE+ | `internlm/Intern-S1`, etc. | | ✅︎ | ✅︎ |
| `InternVLChatModel` | InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + IE+ + (VE+) | `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + IE+ + VE+ | `Kwai-Keye/Keye-VL-8B-Preview` | | | ✅︎ |
| `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I+ | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | | ✅︎ |
@@ -612,6 +614,7 @@ Specified using `--task generate`.
| `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + IE | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. | | ✅︎ | ⚠️ |
| `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + IE+ | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | | ✅︎ | ✅︎ |
| `Phi4MMForCausalLM` | Phi-4-multimodal | T + I+ / T + A+ / I+ + A+ | `microsoft/Phi-4-multimodal-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
+| `Phi4MultimodalForCausalLM` | Phi-4-multimodal (HF Transformers) | T + I+ / T + A+ / I+ + A+ | `microsoft/Phi-4-multimodal-instruct` (with revision `refs/pr/70`), etc. | ✅︎ | ✅︎ | ✅︎ |
| `PixtralForConditionalGeneration` | Mistral 3 (Mistral format), Pixtral (Mistral format) | T + I+ | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistralai/Pixtral-12B-2409`, etc. | | ✅︎ | ✅︎ |
| `QwenVLForConditionalGeneration`^ | Qwen-VL | T + IE+ | `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Qwen2AudioForConditionalGeneration` | Qwen2-Audio | T + A+ | `Qwen/Qwen2-Audio-7B-Instruct` | | ✅︎ | ✅︎ |
diff --git a/docs/training/rlhf.md b/docs/training/rlhf.md
index 4f75e4e01495c..f608a630ab7a5 100644
--- a/docs/training/rlhf.md
+++ b/docs/training/rlhf.md
@@ -2,10 +2,14 @@
Reinforcement Learning from Human Feedback (RLHF) is a technique that fine-tunes language models using human-generated preference data to align model outputs with desired behaviors.
-vLLM can be used to generate the completions for RLHF. The best way to do this is with libraries like [TRL](https://github.com/huggingface/trl), [OpenRLHF](https://github.com/OpenRLHF/OpenRLHF) and [verl](https://github.com/volcengine/verl).
+vLLM can be used to generate the completions for RLHF. Some ways to do this include using libraries like [TRL](https://github.com/huggingface/trl), [OpenRLHF](https://github.com/OpenRLHF/OpenRLHF), [verl](https://github.com/volcengine/verl) and [unsloth](https://github.com/unslothai/unsloth).
See the following basic examples to get started if you don't want to use an existing library:
- [Training and inference processes are located on separate GPUs (inspired by OpenRLHF)](../examples/offline_inference/rlhf.md)
- [Training and inference processes are colocated on the same GPUs using Ray](../examples/offline_inference/rlhf_colocate.md)
- [Utilities for performing RLHF with vLLM](../examples/offline_inference/rlhf_utils.md)
+
+See the following notebooks showing how to use vLLM for GRPO:
+
+- [Qwen-3 4B GRPO using Unsloth + vLLM](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_(4B)-GRPO.ipynb)
diff --git a/examples/offline_inference/audio_language.py b/examples/offline_inference/audio_language.py
index 8014cb53f16a8..01d6a188be994 100644
--- a/examples/offline_inference/audio_language.py
+++ b/examples/offline_inference/audio_language.py
@@ -190,6 +190,37 @@ def run_phi4mm(question: str, audio_count: int) -> ModelRequestData:
)
+def run_phi4_multimodal(question: str, audio_count: int) -> ModelRequestData:
+ """
+ Phi-4-multimodal-instruct supports both image and audio inputs. Here, we
+ show how to process audio inputs.
+ """
+ model_path = snapshot_download(
+ "microsoft/Phi-4-multimodal-instruct", revision="refs/pr/70"
+ )
+ # Since the vision-lora and speech-lora co-exist with the base model,
+ # we have to manually specify the path of the lora weights.
+ speech_lora_path = os.path.join(model_path, "speech-lora")
+ placeholders = "<|audio|>" * audio_count
+
+ prompts = f"<|user|>{placeholders}{question}<|end|><|assistant|>"
+
+ engine_args = EngineArgs(
+ model=model_path,
+ max_model_len=12800,
+ max_num_seqs=2,
+ enable_lora=True,
+ max_lora_rank=320,
+ limit_mm_per_prompt={"audio": audio_count},
+ )
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompt=prompts,
+ lora_requests=[LoRARequest("speech", 1, speech_lora_path)],
+ )
+
+
# Qwen2-Audio
def run_qwen2_audio(question: str, audio_count: int) -> ModelRequestData:
model_name = "Qwen/Qwen2-Audio-7B-Instruct"
@@ -303,6 +334,7 @@ model_example_map = {
"granite_speech": run_granite_speech,
"minicpmo": run_minicpmo,
"phi4_mm": run_phi4mm,
+ "phi4_multimodal": run_phi4_multimodal,
"qwen2_audio": run_qwen2_audio,
"qwen2_5_omni": run_qwen2_5_omni,
"ultravox": run_ultravox,
diff --git a/examples/offline_inference/prithvi_geospatial_mae.py b/examples/offline_inference/prithvi_geospatial_mae.py
index 4fdc7a3cf709e..b6007b9f46301 100644
--- a/examples/offline_inference/prithvi_geospatial_mae.py
+++ b/examples/offline_inference/prithvi_geospatial_mae.py
@@ -3,12 +3,12 @@
import argparse
import datetime
import os
-import re
from typing import Union
import albumentations
import numpy as np
import rasterio
+import regex as re
import torch
from einops import rearrange
from terratorch.datamodules import Sen1Floods11NonGeoDataModule
diff --git a/examples/offline_inference/save_sharded_state.py b/examples/offline_inference/save_sharded_state.py
index 9b154e370642b..d6b8b7e6838d7 100644
--- a/examples/offline_inference/save_sharded_state.py
+++ b/examples/offline_inference/save_sharded_state.py
@@ -29,6 +29,7 @@ import shutil
from pathlib import Path
from vllm import LLM, EngineArgs
+from vllm.model_executor.model_loader import ShardedStateLoader
from vllm.utils import FlexibleArgumentParser
@@ -39,7 +40,10 @@ def parse_args():
"--output", "-o", required=True, type=str, help="path to output checkpoint"
)
parser.add_argument(
- "--file-pattern", type=str, help="string pattern of saved filenames"
+ "--file-pattern",
+ type=str,
+ default=ShardedStateLoader.DEFAULT_PATTERN,
+ help="string pattern of saved filenames",
)
parser.add_argument(
"--max-file-size",
diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py
index e4811c023377f..6f23a29e72f71 100644
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -316,6 +316,85 @@ def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
)
+# naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B
+def run_hyperclovax_seed_vision(
+ questions: list[str], modality: str
+) -> ModelRequestData:
+ model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"
+ tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+
+ engine_args = EngineArgs(
+ model=model_name,
+ trust_remote_code=True,
+ max_model_len=8192 if modality == "image" else 16384,
+ limit_mm_per_prompt={modality: 1},
+ )
+
+ messages = list()
+ for question in questions:
+ if modality == "image":
+ """
+ ocr: List the words in the image in raster order.
+ Even if the word order feels unnatural for reading,
+ the model will handle it as long as it follows raster order.
+ e.g. "Naver, CLOVA, bigshane"
+ lens_keywords: List the entity names in the image.
+ e.g. "iPhone"
+ lens_local_keywords: List the entity names with quads in the image.
+ e.g. "[0.07, 0.21, 0.92, 0.90] iPhone"
+ """
+ messages.append(
+ [
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "image",
+ "ocr": "",
+ "lens_keywords": "",
+ "lens_local_keywords": "",
+ },
+ {
+ "type": "text",
+ "text": question,
+ },
+ ],
+ }
+ ]
+ )
+ elif modality == "video":
+ messages.append(
+ [
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "video",
+ },
+ {
+ "type": "text",
+ "text": question,
+ },
+ ],
+ }
+ ]
+ )
+ else:
+ raise ValueError(f"Unsupported modality: {modality}")
+
+ prompts = tokenizer.apply_chat_template(
+ messages,
+ tokenize=False,
+ add_generation_prompt=True,
+ )
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompts=prompts,
+ stop_token_ids=None,
+ )
+
+
# Idefics3-8B-Llama3
def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
@@ -389,6 +468,39 @@ def run_tarsier(questions: list[str], modality: str) -> ModelRequestData:
)
+# Intern-S1
+def run_interns1(questions: list[str], modality: str) -> ModelRequestData:
+ model_name = "internlm/Intern-S1"
+
+ engine_args = EngineArgs(
+ model=model_name,
+ trust_remote_code=True,
+ max_model_len=8192,
+ max_num_seqs=2,
+ limit_mm_per_prompt={modality: 1},
+ enforce_eager=True,
+ )
+
+ if modality == "image":
+ placeholder = ""
+ elif modality == "video":
+ placeholder = "