Merge remote-tracking branch 'nm/lwilkinson/fix-flashmla-full-cudagraph' into wide_ep_working_branch

This commit is contained in:
Tyler Michael Smith 2025-07-27 21:22:09 +00:00
commit f1c9ef3afd
148 changed files with 7438 additions and 1598 deletions

View File

@ -74,7 +74,7 @@ Here is an example of one test inside `latency-tests.json`:
In this example: In this example:
- The `test_name` attribute is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`. - The `test_name` attribute is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`.
- The `parameters` attribute control the command line arguments to be used for `benchmark_latency.py`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underline to dash when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15` - The `parameters` attribute control the command line arguments to be used for `vllm bench latency`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underline to dash when feeding the arguments to `vllm bench latency`. For example, the corresponding command line arguments for `vllm bench latency` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
Note that the performance numbers are highly sensitive to the value of the parameters. Please make sure the parameters are set correctly. Note that the performance numbers are highly sensitive to the value of the parameters. Please make sure the parameters are set correctly.
@ -82,13 +82,13 @@ WARNING: The benchmarking script will save json results by itself, so please do
### Throughput test ### Throughput test
The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except for that the parameters will be fed forward to `benchmark_throughput.py`. The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except for that the parameters will be fed forward to `vllm bench throughput`.
The number of this test is also stable -- a slight change on the value of this number might vary the performance numbers by a lot. The number of this test is also stable -- a slight change on the value of this number might vary the performance numbers by a lot.
### Serving test ### Serving test
We test the throughput by using `benchmark_serving.py` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example: We test the throughput by using `vllm bench serve` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example:
```json ```json
[ [
@ -118,8 +118,8 @@ Inside this example:
- The `test_name` attribute is also a unique identifier for the test. It must start with `serving_`. - The `test_name` attribute is also a unique identifier for the test. It must start with `serving_`.
- The `server-parameters` includes the command line arguments for vLLM server. - The `server-parameters` includes the command line arguments for vLLM server.
- The `client-parameters` includes the command line arguments for `benchmark_serving.py`. - The `client-parameters` includes the command line arguments for `vllm bench serve`.
- The `qps_list` controls the list of qps for test. It will be used to configure the `--request-rate` parameter in `benchmark_serving.py` - The `qps_list` controls the list of qps for test. It will be used to configure the `--request-rate` parameter in `vllm bench serve`
The number of this test is less stable compared to the delay and latency benchmarks (due to randomized sharegpt dataset sampling inside `benchmark_serving.py`), but a large change in this number (e.g. a 5% change) can still vary the output greatly. The number of this test is less stable compared to the delay and latency benchmarks (due to randomized sharegpt dataset sampling inside `benchmark_serving.py`), but a large change in this number (e.g. a 5% change) can still vary the output greatly.

View File

@ -100,7 +100,7 @@ if __name__ == "__main__":
raw_result = json.loads(f.read()) raw_result = json.loads(f.read())
if "serving" in str(test_file): if "serving" in str(test_file):
# this result is generated via `benchmark_serving.py` # this result is generated via `vllm bench serve` command
# attach the benchmarking command to raw_result # attach the benchmarking command to raw_result
try: try:
@ -120,7 +120,7 @@ if __name__ == "__main__":
continue continue
elif "latency" in f.name: elif "latency" in f.name:
# this result is generated via `benchmark_latency.py` # this result is generated via `vllm bench latency` command
# attach the benchmarking command to raw_result # attach the benchmarking command to raw_result
try: try:
@ -148,7 +148,7 @@ if __name__ == "__main__":
continue continue
elif "throughput" in f.name: elif "throughput" in f.name:
# this result is generated via `benchmark_throughput.py` # this result is generated via `vllm bench throughput` command
# attach the benchmarking command to raw_result # attach the benchmarking command to raw_result
try: try:

View File

@ -73,7 +73,7 @@ get_current_llm_serving_engine() {
echo "Container: vllm" echo "Container: vllm"
# move to a completely irrelevant directory, to avoid import vllm from current folder # move to a completely irrelevant directory, to avoid import vllm from current folder
export CURRENT_LLM_SERVING_ENGINE=vllm export CURRENT_LLM_SERVING_ENGINE=vllm
return return
fi fi
} }
@ -95,12 +95,14 @@ json2args() {
} }
kill_gpu_processes() { kill_gpu_processes() {
pkill -f python pkill -f '[p]ython'
pkill -f python3 pkill -f '[p]ython3'
pkill -f tritonserver pkill -f '[t]ritonserver'
pkill -f pt_main_thread pkill -f '[p]t_main_thread'
pkill -f text-generation pkill -f '[t]ext-generation'
pkill -f lmdeploy pkill -f '[l]mdeploy'
# vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
pkill -f '[V]LLM'
while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
sleep 1 sleep 1
@ -125,7 +127,7 @@ ensure_installed() {
} }
run_serving_tests() { run_serving_tests() {
# run serving tests using `benchmark_serving.py` # run serving tests using `vllm bench serve` command
# $1: a json file specifying serving test cases # $1: a json file specifying serving test cases
local serving_test_file local serving_test_file
@ -225,7 +227,7 @@ run_serving_tests() {
if [[ "$dataset_name" = "sharegpt" ]]; then if [[ "$dataset_name" = "sharegpt" ]]; then
client_command="python3 benchmark_serving.py \ client_command="vllm bench serve \
--backend $backend \ --backend $backend \
--tokenizer /tokenizer_cache \ --tokenizer /tokenizer_cache \
--model $model \ --model $model \
@ -246,7 +248,7 @@ run_serving_tests() {
sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len') sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len')
sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len') sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len')
client_command="python3 benchmark_serving.py \ client_command="vllm bench serve \
--backend $backend \ --backend $backend \
--tokenizer /tokenizer_cache \ --tokenizer /tokenizer_cache \
--model $model \ --model $model \
@ -265,13 +267,13 @@ run_serving_tests() {
$client_args" $client_args"
else else
echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name." echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name."
exit 1 exit 1
fi fi
echo "Running test case $test_name with qps $qps" echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command" echo "Client command: $client_command"
@ -302,7 +304,7 @@ run_serving_tests() {
} }
run_genai_perf_tests() { run_genai_perf_tests() {
# run genai-perf tests # run genai-perf tests
# $1: a json file specifying genai-perf test cases # $1: a json file specifying genai-perf test cases
local genai_perf_test_file local genai_perf_test_file
@ -311,14 +313,14 @@ run_genai_perf_tests() {
# Iterate over genai-perf tests # Iterate over genai-perf tests
jq -c '.[]' "$genai_perf_test_file" | while read -r params; do jq -c '.[]' "$genai_perf_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it. # get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name') test_name=$(echo "$params" | jq -r '.test_name')
# if TEST_SELECTOR is set, only run the test cases that match the selector # if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name." echo "Skip test case $test_name."
continue continue
fi fi
# prepend the current serving engine to the test name # prepend the current serving engine to the test name
test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name} test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
@ -369,10 +371,10 @@ run_genai_perf_tests() {
qps=$num_prompts qps=$num_prompts
echo "now qps is $qps" echo "now qps is $qps"
fi fi
new_test_name=$test_name"_qps_"$qps new_test_name=$test_name"_qps_"$qps
backend=$CURRENT_LLM_SERVING_ENGINE backend=$CURRENT_LLM_SERVING_ENGINE
if [[ "$backend" == *"vllm"* ]]; then if [[ "$backend" == *"vllm"* ]]; then
backend="vllm" backend="vllm"
fi fi
@ -413,7 +415,7 @@ prepare_dataset() {
do do
cat sonnet.txt >> sonnet_4x.txt cat sonnet.txt >> sonnet_4x.txt
done done
} }
main() { main() {

View File

@ -126,7 +126,8 @@ kill_gpu_processes() {
ps -aux ps -aux
lsof -t -i:8000 | xargs -r kill -9 lsof -t -i:8000 | xargs -r kill -9
pgrep python3 | xargs -r kill -9 pgrep python3 | xargs -r kill -9
# vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
pgrep VLLM | xargs -r kill -9
# wait until GPU memory usage smaller than 1GB # wait until GPU memory usage smaller than 1GB
if command -v nvidia-smi; then if command -v nvidia-smi; then
@ -164,7 +165,7 @@ upload_to_buildkite() {
} }
run_latency_tests() { run_latency_tests() {
# run latency tests using `benchmark_latency.py` # run latency tests using `vllm bench latency` command
# $1: a json file specifying latency test cases # $1: a json file specifying latency test cases
local latency_test_file local latency_test_file
@ -205,7 +206,7 @@ run_latency_tests() {
fi fi
fi fi
latency_command=" $latency_envs python3 benchmark_latency.py \ latency_command=" $latency_envs vllm bench latency \
--output-json $RESULTS_FOLDER/${test_name}.json \ --output-json $RESULTS_FOLDER/${test_name}.json \
$latency_args" $latency_args"
@ -231,7 +232,7 @@ run_latency_tests() {
} }
run_throughput_tests() { run_throughput_tests() {
# run throughput tests using `benchmark_throughput.py` # run throughput tests using `vllm bench throughput`
# $1: a json file specifying throughput test cases # $1: a json file specifying throughput test cases
local throughput_test_file local throughput_test_file
@ -272,7 +273,7 @@ run_throughput_tests() {
fi fi
fi fi
throughput_command=" $throughput_envs python3 benchmark_throughput.py \ throughput_command=" $throughput_envs vllm bench throughput \
--output-json $RESULTS_FOLDER/${test_name}.json \ --output-json $RESULTS_FOLDER/${test_name}.json \
$throughput_args" $throughput_args"
@ -297,7 +298,7 @@ run_throughput_tests() {
} }
run_serving_tests() { run_serving_tests() {
# run serving tests using `benchmark_serving.py` # run serving tests using `vllm bench serve` command
# $1: a json file specifying serving test cases # $1: a json file specifying serving test cases
local serving_test_file local serving_test_file
@ -393,7 +394,7 @@ run_serving_tests() {
# pass the tensor parallel size to the client so that it can be displayed # pass the tensor parallel size to the client so that it can be displayed
# on the benchmark dashboard # on the benchmark dashboard
client_command="python3 benchmark_serving.py \ client_command="vllm bench serve \
--save-result \ --save-result \
--result-dir $RESULTS_FOLDER \ --result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \ --result-filename ${new_test_name}.json \
@ -447,7 +448,7 @@ main() {
(which jq) || (apt-get update && apt-get -y install jq) (which jq) || (apt-get update && apt-get -y install jq)
(which lsof) || (apt-get update && apt-get install -y lsof) (which lsof) || (apt-get update && apt-get install -y lsof)
# get the current IP address, required by benchmark_serving.py # get the current IP address, required by `vllm bench serve` command
export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
# turn off the reporting of the status of each request, to clean up the terminal output # turn off the reporting of the status of each request, to clean up the terminal output
export VLLM_LOGGING_LEVEL="WARNING" export VLLM_LOGGING_LEVEL="WARNING"

View File

@ -13,9 +13,9 @@ NUMA_NODE=${NUMA_NODE:-1}
export CMAKE_BUILD_PARALLEL_LEVEL=32 export CMAKE_BUILD_PARALLEL_LEVEL=32
# Setup cleanup # Setup cleanup
remove_docker_container() { remove_docker_container() {
set -e; set -e;
docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true; docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true;
} }
trap remove_docker_container EXIT trap remove_docker_container EXIT
remove_docker_container remove_docker_container
@ -69,7 +69,7 @@ function cpu_tests() {
docker exec cpu-test-"$NUMA_NODE" bash -c " docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e set -e
pytest -s -v \ pytest -s -v \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]" tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"
# Note: disable it until supports V1 # Note: disable it until supports V1
# Run AWQ test # Run AWQ test
@ -83,7 +83,7 @@ function cpu_tests() {
set -e set -e
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 & VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1 timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
python3 benchmarks/benchmark_serving.py \ vllm bench serve \
--backend vllm \ --backend vllm \
--dataset-name random \ --dataset-name random \
--model meta-llama/Llama-3.2-3B-Instruct \ --model meta-llama/Llama-3.2-3B-Instruct \

View File

@ -0,0 +1,166 @@
#!/bin/bash
# Trace every command (-x) and treat the use of an unset variable as an error (-u).
set -x -u

# Force-remove the containers this job may have left behind. A failed removal
# is deliberately ignored so that cleanup never aborts the script.
remove_docker_container() {
  local container
  for container in tpu-test vllm-tpu; do
    docker rm -f "$container" || true
  done
}

# Clean up whenever the script exits, and clear anything that a previous run
# did not get to remove before starting this one.
trap remove_docker_container EXIT
remove_docker_container

# Build the image used by the test container from the TPU Dockerfile.
docker build -f docker/Dockerfile.tpu -t vllm-tpu .
# Set up cleanup.
# Free Docker disk space when the filesystem holding Docker's root directory is
# more than 70% full; otherwise leave images and volumes untouched.
# Exits the whole script with status 1 if the Docker root directory cannot be
# determined. Returns 0 in every other case.
cleanup_docker() {
  # Keep the working variables out of the global scope of the script.
  local docker_root disk_usage
  local threshold=70

  # Get Docker's root directory
  docker_root=$(docker info -f '{{.DockerRootDir}}')
  if [ -z "$docker_root" ]; then
    echo "Failed to determine Docker root directory."
    exit 1
  fi
  echo "Docker root directory: $docker_root"

  # Check disk usage of the filesystem where Docker's root directory is located.
  # -P selects the POSIX output format, which prints each filesystem on a single
  # line, so a long device name cannot wrap the record and move the capacity
  # value out of the fifth field.
  disk_usage=$(df -P "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')

  # A value that is not a plain number would make the comparison below fail and
  # fall through to the "below threshold" message, so report it explicitly.
  case "$disk_usage" in
    '' | *[!0-9]*)
      echo "Could not parse disk usage for $docker_root (got '$disk_usage'). Skipping cleanup."
      return 0
      ;;
  esac

  if [ "$disk_usage" -gt "$threshold" ]; then
    echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
    # Remove dangling images (those that are not tagged and not used by any container)
    docker image prune -f
    # Remove unused volumes / force the system prune for old images as well.
    docker volume prune -f && docker system prune --force --filter "until=72h" --all
    echo "Docker images and volumes cleanup completed."
  else
    echo "Disk usage is below $threshold%. No cleanup needed."
  fi
}
cleanup_docker
# Load /etc/environment into this shell; HF_TOKEN is expected to be defined
# there and is forwarded to the container below.
source /etc/environment
# Run the test suite inside a privileged container named tpu-test that is
# started from the vllm-tpu image with host networking and 16G of shared memory.
#
# The script handed to /bin/bash -c is wrapped in single quotes, so nothing in
# it is expanded by this outer shell: variables such as $RESULTS_DIR are
# resolved inside the container. For the same reason the script body must not
# contain a single quote character, which would end the string early.
#
# Inside the container the script:
#   1. creates a temporary results directory and installs the Python test
#      dependencies (aborting on failure, because set -e is still active);
#   2. exports the VLLM_* settings used by the tests;
#   3. switches to set +e and runs six pytest targets through
#      run_and_track_test, so that one failing test does not stop the others;
#   4. exits with 1 if any test failed and with 0 otherwise.
# Each test's output is shown live and also appended to a per-test log file,
# which is printed again on stderr when that test fails.
docker run --privileged --net host --shm-size=16G -it \
-e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
vllm-tpu /bin/bash -c '
set -e # Exit immediately if a command exits with a non-zero status.
set -u # Treat unset variables as an error.
echo "--- Starting script inside Docker container ---"
# Create results directory
RESULTS_DIR=$(mktemp -d)
# If mktemp fails, set -e will cause the script to exit.
echo "Results will be stored in: $RESULTS_DIR"
# Install dependencies
echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
&& python3 -m pip install --progress-bar off lm_eval[api]==0.4.4 \
&& python3 -m pip install --progress-bar off hf-transfer
echo "--- Python dependencies installed ---"
export VLLM_USE_V1=1
export VLLM_XLA_CHECK_RECOMPILATION=1
export VLLM_XLA_CACHE_PATH=
echo "Using VLLM V1"
echo "--- Hardware Information ---"
# tpu-info
echo "--- Starting Tests ---"
set +e
overall_script_exit_code=0
# --- Test Definitions ---
# If a test fails, this function will print logs and will not cause the main script to exit.
run_test() {
local test_num=$1
local test_name=$2
local test_command=$3
local log_file="$RESULTS_DIR/test_${test_num}.log"
local actual_exit_code
echo "--- TEST_$test_num: Running $test_name ---"
# Execute the test command.
eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2)
actual_exit_code=$?
echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" # This goes to main log
echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file" # Also to per-test log
if [ "$actual_exit_code" -ne 0 ]; then
echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2
echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2
if [ -f "$log_file" ]; then
cat "$log_file" >&2
else
echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2
fi
echo "--- End of log for TEST_$test_num ($test_name) ---" >&2
return "$actual_exit_code" # Return the failure code
else
echo "TEST_$test_num ($test_name) PASSED."
return 0 # Return success
fi
}
# Helper function to call run_test and update the overall script exit code
run_and_track_test() {
local test_num_arg="$1"
local test_name_arg="$2"
local test_command_arg="$3"
# Run the test
run_test "$test_num_arg" "$test_name_arg" "$test_command_arg"
local test_specific_exit_code=$?
# If the test failed, set the overall script exit code to 1
if [ "$test_specific_exit_code" -ne 0 ]; then
# No need for extra echo here, run_test already logged the failure.
overall_script_exit_code=1
fi
}
# --- Actual Test Execution ---
run_and_track_test 1 "test_struct_output_generate.py" \
"HF_HUB_DISABLE_XET=1 python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
run_and_track_test 2 "test_moe_pallas.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
run_and_track_test 3 "test_lora.py" \
"VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
run_and_track_test 4 "test_tpu_qkv_linear.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
run_and_track_test 5 "test_spmd_model_weight_loading.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
run_and_track_test 6 "test_kv_cache_update_kernel.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py"
# After all tests have been attempted, exit with the overall status.
if [ "$overall_script_exit_code" -ne 0 ]; then
echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---"
else
echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---"
fi
exit "$overall_script_exit_code"
' # IMPORTANT: this single quote closes the script passed to /bin/bash -c above. Ensure it is present and correct.
# Record how the docker run command ended. This assignment has to remain the
# first statement after that command so that $? still refers to it.
DOCKER_RUN_EXIT_CODE=$?
# Report the outcome, then end the script with the same status as docker run.
# Cleanup is left to the EXIT trap.
case "$DOCKER_RUN_EXIT_CODE" in
  0)
    echo "Docker run command completed successfully."
    ;;
  *)
    echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE."
    ;;
esac
exit "$DOCKER_RUN_EXIT_CODE"
# TODO: This test fails because it uses RANDOM_SEED sampling
# pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \

View File

@ -150,18 +150,6 @@ run_and_track_test 9 "test_multimodal.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py" "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py"
run_and_track_test 10 "test_pallas.py" \ run_and_track_test 10 "test_pallas.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py" "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py"
run_and_track_test 11 "test_struct_output_generate.py" \
"HF_HUB_DISABLE_XET=1 python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
run_and_track_test 12 "test_moe_pallas.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
run_and_track_test 13 "test_lora.py" \
"VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
run_and_track_test 14 "test_tpu_qkv_linear.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
run_and_track_test 15 "test_spmd_model_weight_loading.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
run_and_track_test 16 "test_kv_cache_update_kernel.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py"
# After all tests have been attempted, exit with the overall status. # After all tests have been attempted, exit with the overall status.
if [ "$overall_script_exit_code" -ne 0 ]; then if [ "$overall_script_exit_code" -ne 0 ]; then

View File

@ -11,10 +11,10 @@ cd "$(dirname "${BASH_SOURCE[0]}")/../.."
(which wget && which curl) || (apt-get update && apt-get install -y wget curl) (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
# run python-based benchmarks and upload the result to buildkite # run python-based benchmarks and upload the result to buildkite
python3 benchmarks/benchmark_latency.py --output-json latency_results.json 2>&1 | tee benchmark_latency.txt vllm bench latency --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
bench_latency_exit_code=$? bench_latency_exit_code=$?
python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt vllm bench throughput --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
bench_throughput_exit_code=$? bench_throughput_exit_code=$?
# run server-based benchmarks and upload the result to buildkite # run server-based benchmarks and upload the result to buildkite
@ -24,7 +24,7 @@ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/r
# wait for server to start, timeout after 600 seconds # wait for server to start, timeout after 600 seconds
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1 timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
python3 benchmarks/benchmark_serving.py \ vllm bench serve \
--backend vllm \ --backend vllm \
--dataset-name sharegpt \ --dataset-name sharegpt \
--dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \ --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \

View File

@ -77,7 +77,7 @@ done
echo "run benchmark test..." echo "run benchmark test..."
echo "logging to $BM_LOG" echo "logging to $BM_LOG"
echo echo
python benchmarks/benchmark_serving.py \ vllm bench serve \
--backend vllm \ --backend vllm \
--model $MODEL \ --model $MODEL \
--dataset-name sonnet \ --dataset-name sonnet \

View File

@ -7,7 +7,7 @@ permissions:
jobs: jobs:
lint-and-deploy: lint-and-deploy:
runs-on: ubuntu-latest runs-on: ubuntu-24.04-arm
steps: steps:
- name: Checkout - name: Checkout
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

View File

@ -98,7 +98,7 @@ Then run the benchmarking script
```bash ```bash
# download dataset # download dataset
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json # wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
python3 vllm/benchmarks/benchmark_serving.py \ vllm bench serve \
--backend vllm \ --backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \ --model NousResearch/Hermes-3-Llama-3.1-8B \
--endpoint /v1/completions \ --endpoint /v1/completions \
@ -111,25 +111,25 @@ If successful, you will see the following output
``` ```
============ Serving Benchmark Result ============ ============ Serving Benchmark Result ============
Successful requests: 10 Successful requests: 10
Benchmark duration (s): 5.78 Benchmark duration (s): 5.78
Total input tokens: 1369 Total input tokens: 1369
Total generated tokens: 2212 Total generated tokens: 2212
Request throughput (req/s): 1.73 Request throughput (req/s): 1.73
Output token throughput (tok/s): 382.89 Output token throughput (tok/s): 382.89
Total Token throughput (tok/s): 619.85 Total Token throughput (tok/s): 619.85
---------------Time to First Token---------------- ---------------Time to First Token----------------
Mean TTFT (ms): 71.54 Mean TTFT (ms): 71.54
Median TTFT (ms): 73.88 Median TTFT (ms): 73.88
P99 TTFT (ms): 79.49 P99 TTFT (ms): 79.49
-----Time per Output Token (excl. 1st token)------ -----Time per Output Token (excl. 1st token)------
Mean TPOT (ms): 7.91 Mean TPOT (ms): 7.91
Median TPOT (ms): 7.96 Median TPOT (ms): 7.96
P99 TPOT (ms): 8.03 P99 TPOT (ms): 8.03
---------------Inter-token Latency---------------- ---------------Inter-token Latency----------------
Mean ITL (ms): 7.74 Mean ITL (ms): 7.74
Median ITL (ms): 7.70 Median ITL (ms): 7.70
P99 ITL (ms): 8.39 P99 ITL (ms): 8.39
================================================== ==================================================
``` ```
@ -141,7 +141,7 @@ If the dataset you want to benchmark is not supported yet in vLLM, even then you
{"prompt": "What is the capital of India?"} {"prompt": "What is the capital of India?"}
{"prompt": "What is the capital of Iran?"} {"prompt": "What is the capital of Iran?"}
{"prompt": "What is the capital of China?"} {"prompt": "What is the capital of China?"}
``` ```
```bash ```bash
# start server # start server
@ -150,7 +150,7 @@ VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct --disable-log-requests
```bash ```bash
# run benchmarking script # run benchmarking script
python3 benchmarks/benchmark_serving.py --port 9001 --save-result --save-detailed \ vllm bench serve --port 9001 --save-result --save-detailed \
--backend vllm \ --backend vllm \
--model meta-llama/Llama-3.1-8B-Instruct \ --model meta-llama/Llama-3.1-8B-Instruct \
--endpoint /v1/completions \ --endpoint /v1/completions \
@ -174,7 +174,7 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
``` ```
```bash ```bash
python3 vllm/benchmarks/benchmark_serving.py \ vllm bench serve \
--backend openai-chat \ --backend openai-chat \
--model Qwen/Qwen2-VL-7B-Instruct \ --model Qwen/Qwen2-VL-7B-Instruct \
--endpoint /v1/chat/completions \ --endpoint /v1/chat/completions \
@ -194,7 +194,7 @@ VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
``` ```
``` bash ``` bash
python3 benchmarks/benchmark_serving.py \ vllm bench serve \
--model meta-llama/Meta-Llama-3-8B-Instruct \ --model meta-llama/Meta-Llama-3-8B-Instruct \
--dataset-name hf \ --dataset-name hf \
--dataset-path likaixin/InstructCoder \ --dataset-path likaixin/InstructCoder \
@ -210,7 +210,7 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
**`lmms-lab/LLaVA-OneVision-Data`** **`lmms-lab/LLaVA-OneVision-Data`**
```bash ```bash
python3 vllm/benchmarks/benchmark_serving.py \ vllm bench serve \
--backend openai-chat \ --backend openai-chat \
--model Qwen/Qwen2-VL-7B-Instruct \ --model Qwen/Qwen2-VL-7B-Instruct \
--endpoint /v1/chat/completions \ --endpoint /v1/chat/completions \
@ -224,7 +224,7 @@ python3 vllm/benchmarks/benchmark_serving.py \
**`Aeala/ShareGPT_Vicuna_unfiltered`** **`Aeala/ShareGPT_Vicuna_unfiltered`**
```bash ```bash
python3 vllm/benchmarks/benchmark_serving.py \ vllm bench serve \
--backend openai-chat \ --backend openai-chat \
--model Qwen/Qwen2-VL-7B-Instruct \ --model Qwen/Qwen2-VL-7B-Instruct \
--endpoint /v1/chat/completions \ --endpoint /v1/chat/completions \
@ -237,7 +237,7 @@ python3 vllm/benchmarks/benchmark_serving.py \
**`AI-MO/aimo-validation-aime`** **`AI-MO/aimo-validation-aime`**
``` bash ``` bash
python3 vllm/benchmarks/benchmark_serving.py \ vllm bench serve \
--model Qwen/QwQ-32B \ --model Qwen/QwQ-32B \
--dataset-name hf \ --dataset-name hf \
--dataset-path AI-MO/aimo-validation-aime \ --dataset-path AI-MO/aimo-validation-aime \
@ -248,7 +248,7 @@ python3 vllm/benchmarks/benchmark_serving.py \
**`philschmid/mt-bench`** **`philschmid/mt-bench`**
``` bash ``` bash
python3 vllm/benchmarks/benchmark_serving.py \ vllm bench serve \
--model Qwen/QwQ-32B \ --model Qwen/QwQ-32B \
--dataset-name hf \ --dataset-name hf \
--dataset-path philschmid/mt-bench \ --dataset-path philschmid/mt-bench \
@ -261,7 +261,7 @@ When using OpenAI-compatible backends such as `vllm`, optional sampling
parameters can be specified. Example client command: parameters can be specified. Example client command:
```bash ```bash
python3 vllm/benchmarks/benchmark_serving.py \ vllm bench serve \
--backend vllm \ --backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \ --model NousResearch/Hermes-3-Llama-3.1-8B \
--endpoint /v1/completions \ --endpoint /v1/completions \
@ -296,7 +296,7 @@ The following arguments can be used to control the ramp-up:
<br/> <br/>
```bash ```bash
python3 vllm/benchmarks/benchmark_throughput.py \ vllm bench throughput \
--model NousResearch/Hermes-3-Llama-3.1-8B \ --model NousResearch/Hermes-3-Llama-3.1-8B \
--dataset-name sonnet \ --dataset-name sonnet \
--dataset-path vllm/benchmarks/sonnet.txt \ --dataset-path vllm/benchmarks/sonnet.txt \
@ -314,7 +314,7 @@ Total num output tokens: 1500
**VisionArena Benchmark for Vision Language Models** **VisionArena Benchmark for Vision Language Models**
``` bash ``` bash
python3 vllm/benchmarks/benchmark_throughput.py \ vllm bench throughput \
--model Qwen/Qwen2-VL-7B-Instruct \ --model Qwen/Qwen2-VL-7B-Instruct \
--backend vllm-chat \ --backend vllm-chat \
--dataset-name hf \ --dataset-name hf \
@ -336,7 +336,7 @@ Total num output tokens: 1280
``` bash ``` bash
VLLM_WORKER_MULTIPROC_METHOD=spawn \ VLLM_WORKER_MULTIPROC_METHOD=spawn \
VLLM_USE_V1=1 \ VLLM_USE_V1=1 \
python3 vllm/benchmarks/benchmark_throughput.py \ vllm bench throughput \
--dataset-name=hf \ --dataset-name=hf \
--dataset-path=likaixin/InstructCoder \ --dataset-path=likaixin/InstructCoder \
--model=meta-llama/Meta-Llama-3-8B-Instruct \ --model=meta-llama/Meta-Llama-3-8B-Instruct \
@ -360,7 +360,7 @@ Total num output tokens: 204800
**`lmms-lab/LLaVA-OneVision-Data`** **`lmms-lab/LLaVA-OneVision-Data`**
```bash ```bash
python3 vllm/benchmarks/benchmark_throughput.py \ vllm bench throughput \
--model Qwen/Qwen2-VL-7B-Instruct \ --model Qwen/Qwen2-VL-7B-Instruct \
--backend vllm-chat \ --backend vllm-chat \
--dataset-name hf \ --dataset-name hf \
@ -373,7 +373,7 @@ python3 vllm/benchmarks/benchmark_throughput.py \
**`Aeala/ShareGPT_Vicuna_unfiltered`** **`Aeala/ShareGPT_Vicuna_unfiltered`**
```bash ```bash
python3 vllm/benchmarks/benchmark_throughput.py \ vllm bench throughput \
--model Qwen/Qwen2-VL-7B-Instruct \ --model Qwen/Qwen2-VL-7B-Instruct \
--backend vllm-chat \ --backend vllm-chat \
--dataset-name hf \ --dataset-name hf \
@ -385,7 +385,7 @@ python3 vllm/benchmarks/benchmark_throughput.py \
**`AI-MO/aimo-validation-aime`** **`AI-MO/aimo-validation-aime`**
```bash ```bash
python3 benchmarks/benchmark_throughput.py \ vllm bench throughput \
--model Qwen/QwQ-32B \ --model Qwen/QwQ-32B \
--backend vllm \ --backend vllm \
--dataset-name hf \ --dataset-name hf \
@ -399,7 +399,7 @@ python3 benchmarks/benchmark_throughput.py \
``` bash ``` bash
# download dataset # download dataset
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json # wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
python3 vllm/benchmarks/benchmark_throughput.py \ vllm bench throughput \
--model meta-llama/Llama-2-7b-hf \ --model meta-llama/Llama-2-7b-hf \
--backend vllm \ --backend vllm \
--dataset_path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \ --dataset_path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \

View File

@ -105,7 +105,7 @@ After the script finishes, you will find the results in a new, timestamped direc
- **Log Files**: The directory (`$BASE/auto-benchmark/YYYY_MM_DD_HH_MM/`) contains detailed logs for each run: - **Log Files**: The directory (`$BASE/auto-benchmark/YYYY_MM_DD_HH_MM/`) contains detailed logs for each run:
- `vllm_log_...txt`: The log output from the vLLM server for each parameter combination. - `vllm_log_...txt`: The log output from the vLLM server for each parameter combination.
- `bm_log_...txt`: The log output from the `benchmark_serving.py` script for each benchmark run. - `bm_log_...txt`: The log output from the `vllm bench serve` command for each benchmark run.
- **Final Result Summary**: A file named `result.txt` is created in the log directory. It contains a summary of each tested combination and concludes with the overall best parameters found. - **Final Result Summary**: A file named `result.txt` is created in the log directory. It contains a summary of each tested combination and concludes with the overall best parameters found.

View File

@ -1,6 +1,6 @@
#!/bin/bash #!/bin/bash
# This script searches for the server parameter combination that maximizes throughput under the given requirements. # This script searches for the server parameter combination that maximizes throughput under the given requirements.
# See details in README (benchmarks/auto_tune/README.md). # See details in README (benchmarks/auto_tune/README.md).
TAG=$(date +"%Y_%m_%d_%H_%M") TAG=$(date +"%Y_%m_%d_%H_%M")
@ -56,7 +56,7 @@ start_server() {
local max_num_batched_tokens=$3 local max_num_batched_tokens=$3
local vllm_log=$4 local vllm_log=$4
local profile_dir=$5 local profile_dir=$5
pkill -f vllm pkill -f vllm
VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir vllm serve $MODEL \ VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir vllm serve $MODEL \
@ -73,9 +73,9 @@ start_server() {
# wait for 10 minutes... # wait for 10 minutes...
server_started=0 server_started=0
for i in {1..60}; do for i in {1..60}; do
RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout) RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
STATUS_CODE=$(echo "$RESPONSE" | tail -n 1) STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
if [[ "$STATUS_CODE" -eq 200 ]]; then if [[ "$STATUS_CODE" -eq 200 ]]; then
server_started=1 server_started=1
break break
@ -98,10 +98,10 @@ update_best_profile() {
selected_profile_file= selected_profile_file=
if [[ "$SYSTEM" == "TPU" ]]; then if [[ "$SYSTEM" == "TPU" ]]; then
selected_profile_file="${sorted_paths[$profile_index]}/*.xplane.pb" selected_profile_file="${sorted_paths[$profile_index]}/*.xplane.pb"
fi fi
if [[ "$SYSTEM" == "GPU" ]]; then if [[ "$SYSTEM" == "GPU" ]]; then
selected_profile_file="${sorted_paths[$profile_index]}" selected_profile_file="${sorted_paths[$profile_index]}"
fi fi
rm -f $PROFILE_PATH/* rm -f $PROFILE_PATH/*
cp $selected_profile_file $PROFILE_PATH cp $selected_profile_file $PROFILE_PATH
} }
@ -129,14 +129,14 @@ run_benchmark() {
echo "server started." echo "server started."
fi fi
echo echo
echo "run benchmark test..." echo "run benchmark test..."
meet_latency_requirement=0 meet_latency_requirement=0
# get a basic qps by using request-rate inf # get a basic qps by using request-rate inf
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt" bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt"
prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 )) prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 ))
adjusted_input_len=$(( INPUT_LEN - prefix_len )) adjusted_input_len=$(( INPUT_LEN - prefix_len ))
python3 benchmarks/benchmark_serving.py \ vllm bench serve \
--backend vllm \ --backend vllm \
--model $MODEL \ --model $MODEL \
--dataset-name random \ --dataset-name random \
@ -169,7 +169,7 @@ adjusted_input_len=$(( INPUT_LEN - prefix_len ))
curl -X POST http://0.0.0.0:8004/reset_prefix_cache curl -X POST http://0.0.0.0:8004/reset_prefix_cache
sleep 5 sleep 5
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt" bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt"
python3 benchmarks/benchmark_serving.py \ vllm bench serve \
--backend vllm \ --backend vllm \
--model $MODEL \ --model $MODEL \
--dataset-name random \ --dataset-name random \

View File

@ -11,6 +11,7 @@ from typing import Any, Optional
import numpy as np import numpy as np
from tqdm import tqdm from tqdm import tqdm
from typing_extensions import deprecated
import vllm.envs as envs import vllm.envs as envs
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
@ -34,6 +35,10 @@ def save_to_pytorch_benchmark_format(
write_to_json(pt_file, pt_records) write_to_json(pt_file, pt_records)
@deprecated(
"benchmark_latency.py is deprecated and will be removed in a "
"future version. Please use 'vllm bench latency' instead.",
)
def main(args: argparse.Namespace): def main(args: argparse.Namespace):
print(args) print(args)

View File

@ -38,6 +38,7 @@ from typing import Any, Literal, Optional
import numpy as np import numpy as np
from tqdm.asyncio import tqdm from tqdm.asyncio import tqdm
from transformers import PreTrainedTokenizerBase from transformers import PreTrainedTokenizerBase
from typing_extensions import deprecated
from backend_request_func import ( from backend_request_func import (
ASYNC_REQUEST_FUNCS, ASYNC_REQUEST_FUNCS,
@ -593,6 +594,10 @@ def save_to_pytorch_benchmark_format(
write_to_json(pt_file, pt_records) write_to_json(pt_file, pt_records)
@deprecated(
"benchmark_serving.py is deprecated and will be removed in a future "
"version. Please use 'vllm bench serve' instead.",
)
def main(args: argparse.Namespace): def main(args: argparse.Namespace):
print(args) print(args)
random.seed(args.seed) random.seed(args.seed)

View File

@ -15,6 +15,7 @@ import torch
import uvloop import uvloop
from tqdm import tqdm from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase
from typing_extensions import deprecated
from benchmark_dataset import ( from benchmark_dataset import (
AIMODataset, AIMODataset,
@ -382,6 +383,10 @@ def get_requests(args, tokenizer):
return dataset_cls(**common_kwargs).sample(**sample_kwargs) return dataset_cls(**common_kwargs).sample(**sample_kwargs)
@deprecated(
"benchmark_throughput.py is deprecated and will be removed in a "
"future version. Please use 'vllm bench throughput' instead.",
)
def main(args: argparse.Namespace): def main(args: argparse.Namespace):
if args.seed is None: if args.seed is None:
args.seed = 0 args.seed = 0

View File

@ -3,7 +3,7 @@
# benchmark the overhead of disaggregated prefill. # benchmark the overhead of disaggregated prefill.
# methodology: # methodology:
# - send all requests to the prefill vLLM instance. It will buffer the KV cache. # - send all requests to the prefill vLLM instance. It will buffer the KV cache.
# - then send all requests to the decode instance. # - then send all requests to the decode instance.
# - The TTFT of the decode instance is the overhead. # - The TTFT of the decode instance is the overhead.
set -ex set -ex
@ -12,6 +12,8 @@ kill_gpu_processes() {
# kill all processes on GPU. # kill all processes on GPU.
pgrep pt_main_thread | xargs -r kill -9 pgrep pt_main_thread | xargs -r kill -9
pgrep python3 | xargs -r kill -9 pgrep python3 | xargs -r kill -9
# vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
pgrep VLLM | xargs -r kill -9
sleep 10 sleep 10
# remove vllm config file # remove vllm config file
@ -61,7 +63,7 @@ benchmark() {
--gpu-memory-utilization 0.6 \ --gpu-memory-utilization 0.6 \
--kv-transfer-config \ --kv-transfer-config \
'{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' & '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
CUDA_VISIBLE_DEVICES=1 python3 \ CUDA_VISIBLE_DEVICES=1 python3 \
-m vllm.entrypoints.openai.api_server \ -m vllm.entrypoints.openai.api_server \
@ -76,38 +78,38 @@ benchmark() {
wait_for_server 8200 wait_for_server 8200
# let the prefill instance finish prefill # let the prefill instance finish prefill
python3 ../benchmark_serving.py \ vllm bench serve \
--backend vllm \ --backend vllm \
--model $model \ --model $model \
--dataset-name $dataset_name \ --dataset-name $dataset_name \
--dataset-path $dataset_path \ --dataset-path $dataset_path \
--sonnet-input-len $input_len \ --sonnet-input-len $input_len \
--sonnet-output-len "$output_len" \ --sonnet-output-len "$output_len" \
--sonnet-prefix-len $prefix_len \ --sonnet-prefix-len $prefix_len \
--num-prompts $num_prompts \ --num-prompts $num_prompts \
--port 8100 \ --port 8100 \
--save-result \ --save-result \
--result-dir $results_folder \ --result-dir $results_folder \
--result-filename disagg_prefill_tp1.json \ --result-filename disagg_prefill_tp1.json \
--request-rate "inf" --request-rate "inf"
# send the request to decode. # send the request to decode.
# The TTFT of this command will be the overhead of disagg prefill impl. # The TTFT of this command will be the overhead of disagg prefill impl.
python3 ../benchmark_serving.py \ vllm bench serve \
--backend vllm \ --backend vllm \
--model $model \ --model $model \
--dataset-name $dataset_name \ --dataset-name $dataset_name \
--dataset-path $dataset_path \ --dataset-path $dataset_path \
--sonnet-input-len $input_len \ --sonnet-input-len $input_len \
--sonnet-output-len "$output_len" \ --sonnet-output-len "$output_len" \
--sonnet-prefix-len $prefix_len \ --sonnet-prefix-len $prefix_len \
--num-prompts $num_prompts \ --num-prompts $num_prompts \
--port 8200 \ --port 8200 \
--save-result \ --save-result \
--result-dir $results_folder \ --result-dir $results_folder \
--result-filename disagg_prefill_tp1_overhead.json \ --result-filename disagg_prefill_tp1_overhead.json \
--request-rate "$qps" --request-rate "$qps"
kill_gpu_processes kill_gpu_processes
} }

View File

@ -18,6 +18,8 @@ kill_gpu_processes() {
# kill all processes on GPU. # kill all processes on GPU.
pgrep pt_main_thread | xargs -r kill -9 pgrep pt_main_thread | xargs -r kill -9
pgrep python3 | xargs -r kill -9 pgrep python3 | xargs -r kill -9
# vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
pgrep VLLM | xargs -r kill -9
for port in 8000 8100 8200; do lsof -t -i:$port | xargs -r kill -9; done for port in 8000 8100 8200; do lsof -t -i:$port | xargs -r kill -9; done
sleep 1 sleep 1
} }
@ -58,7 +60,7 @@ launch_chunked_prefill() {
launch_disagg_prefill() { launch_disagg_prefill() {
model="meta-llama/Meta-Llama-3.1-8B-Instruct" model="meta-llama/Meta-Llama-3.1-8B-Instruct"
# disagg prefill # disagg prefill
CUDA_VISIBLE_DEVICES=0 python3 \ CUDA_VISIBLE_DEVICES=0 python3 \
-m vllm.entrypoints.openai.api_server \ -m vllm.entrypoints.openai.api_server \
@ -97,20 +99,20 @@ benchmark() {
output_len=$2 output_len=$2
tag=$3 tag=$3
python3 ../benchmark_serving.py \ vllm bench serve \
--backend vllm \ --backend vllm \
--model $model \ --model $model \
--dataset-name $dataset_name \ --dataset-name $dataset_name \
--dataset-path $dataset_path \ --dataset-path $dataset_path \
--sonnet-input-len $input_len \ --sonnet-input-len $input_len \
--sonnet-output-len "$output_len" \ --sonnet-output-len "$output_len" \
--sonnet-prefix-len $prefix_len \ --sonnet-prefix-len $prefix_len \
--num-prompts $num_prompts \ --num-prompts $num_prompts \
--port 8000 \ --port 8000 \
--save-result \ --save-result \
--result-dir $results_folder \ --result-dir $results_folder \
--result-filename "$tag"-qps-"$qps".json \ --result-filename "$tag"-qps-"$qps".json \
--request-rate "$qps" --request-rate "$qps"
sleep 2 sleep 2
} }

View File

@ -5,9 +5,8 @@ import itertools
import torch import torch
from vllm import _custom_ops as ops
from vllm.model_executor.layers.fused_moe.moe_align_block_size import ( from vllm.model_executor.layers.fused_moe.moe_align_block_size import (
moe_align_block_size_triton, moe_align_block_size,
) )
from vllm.triton_utils import triton from vllm.triton_utils import triton
@ -21,60 +20,6 @@ def get_topk_ids(num_tokens: int, num_experts: int, topk: int) -> torch.Tensor:
) )
def check_correctness(num_tokens, num_experts=256, block_size=256, topk=8):
"""
Verifies vllm vs. Triton
"""
topk_ids = get_topk_ids(num_tokens, num_experts, topk)
# 1. malloc space for triton and vllm
# malloc enough space (max_num_tokens_padded) for the sorted ids
max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
sorted_ids_triton = torch.empty(
(max_num_tokens_padded,), dtype=torch.int32, device="cuda"
)
expert_ids_triton = torch.empty(
(max_num_tokens_padded // block_size,), dtype=torch.int32, device="cuda"
)
num_tokens_post_pad_triton = torch.empty((1,), dtype=torch.int32, device="cuda")
sorted_ids_vllm = torch.empty_like(sorted_ids_triton)
expert_ids_vllm = torch.empty_like(expert_ids_triton)
num_tokens_post_pad_vllm = torch.empty_like(num_tokens_post_pad_triton)
# 2. run implementations
moe_align_block_size_triton(
topk_ids,
num_experts,
block_size,
sorted_ids_triton,
expert_ids_triton,
num_tokens_post_pad_triton,
)
ops.moe_align_block_size(
topk_ids,
num_experts,
block_size,
sorted_ids_vllm,
expert_ids_vllm,
num_tokens_post_pad_vllm,
)
print(f"✅ VLLM implementation works with {num_experts} experts!")
# 3. compare results
if torch.allclose(expert_ids_triton, expert_ids_vllm) and torch.allclose(
num_tokens_post_pad_triton, num_tokens_post_pad_vllm
):
print("✅ Triton and VLLM implementations match.")
else:
print("❌ Triton and VLLM implementations DO NOT match.")
print("Triton expert_ids:", expert_ids_triton)
print("VLLM expert_ids:", expert_ids_vllm)
print("Triton num_tokens_post_pad:", num_tokens_post_pad_triton)
print("VLLM num_tokens_post_pad:", num_tokens_post_pad_vllm)
# test configurations # test configurations
num_tokens_range = [1, 16, 256, 4096] num_tokens_range = [1, 16, 256, 4096]
num_experts_range = [16, 64, 224, 256, 280, 512] num_experts_range = [16, 64, 224, 256, 280, 512]
@ -87,8 +32,8 @@ configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range
x_names=["num_tokens", "num_experts", "topk"], x_names=["num_tokens", "num_experts", "topk"],
x_vals=configs, x_vals=configs,
line_arg="provider", line_arg="provider",
line_vals=["vllm", "triton"], # "triton" line_vals=["vllm"],
line_names=["VLLM", "Triton"], # "Triton" line_names=["vLLM"],
plot_name="moe-align-block-size-performance", plot_name="moe-align-block-size-performance",
args={}, args={},
) )
@ -98,36 +43,11 @@ def benchmark(num_tokens, num_experts, topk, provider):
block_size = 256 block_size = 256
topk_ids = get_topk_ids(num_tokens, num_experts, topk) topk_ids = get_topk_ids(num_tokens, num_experts, topk)
max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
sorted_ids = torch.empty((max_num_tokens_padded,), dtype=torch.int32, device="cuda")
max_num_m_blocks = max_num_tokens_padded // block_size
expert_ids = torch.empty((max_num_m_blocks,), dtype=torch.int32, device="cuda")
num_tokens_post_pad = torch.empty((1,), dtype=torch.int32, device="cuda")
quantiles = [0.5, 0.2, 0.8] quantiles = [0.5, 0.2, 0.8]
if provider == "vllm": if provider == "vllm":
ms, min_ms, max_ms = triton.testing.do_bench( ms, min_ms, max_ms = triton.testing.do_bench(
lambda: ops.moe_align_block_size( lambda: moe_align_block_size(topk_ids, block_size, num_experts),
topk_ids,
num_experts,
block_size,
sorted_ids.clone(),
expert_ids.clone(),
num_tokens_post_pad.clone(),
),
quantiles=quantiles,
)
elif provider == "triton":
ms, min_ms, max_ms = triton.testing.do_bench(
lambda: moe_align_block_size_triton(
topk_ids,
num_experts,
block_size,
sorted_ids.clone(),
expert_ids.clone(),
num_tokens_post_pad.clone(),
),
quantiles=quantiles, quantiles=quantiles,
) )
@ -151,6 +71,4 @@ if __name__ == "__main__":
) )
args = parser.parse_args() args = parser.parse_args()
print("Running correctness check...")
check_correctness(num_tokens=1024, num_experts=args.num_experts, topk=args.topk)
benchmark.run(print_data=True, show_plots=True) benchmark.run(print_data=True, show_plots=True)

View File

@ -8,12 +8,13 @@ import ray
import torch import torch
from transformers import AutoConfig from transformers import AutoConfig
from vllm.model_executor.layers.fused_moe.deep_gemm_moe import ( from vllm.model_executor.layers.fused_moe.fused_moe import *
from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import (
_moe_permute, _moe_permute,
_moe_unpermute_and_reduce, _moe_unpermute_and_reduce,
moe_permute,
moe_unpermute,
) )
from vllm.model_executor.layers.fused_moe.fused_moe import *
from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import *
from vllm.model_executor.layers.fused_moe.utils import _fp8_quantize from vllm.model_executor.layers.fused_moe.utils import _fp8_quantize
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser
@ -63,18 +64,19 @@ def benchmark_permute(
def run(): def run():
if use_customized_permute: if use_customized_permute:
(permuted_hidden_states, first_token_off, inv_perm_idx, m_indices) = ( (
moe_permute( permuted_hidden_states,
qhidden_states, a1q_scale,
topk_weights=topk_weights, first_token_off,
topk_ids=topk_ids, inv_perm_idx,
token_expert_indices=token_expert_indices, m_indices,
topk=topk, ) = moe_permute(
n_expert=num_experts, qhidden_states,
n_local_expert=num_experts, a1q_scale=None,
expert_map=None, topk_ids=topk_ids,
align_block_size=align_block_size, n_expert=num_experts,
) expert_map=None,
align_block_size=align_block_size,
) )
else: else:
( (
@ -150,18 +152,19 @@ def benchmark_unpermute(
def prepare(): def prepare():
if use_customized_permute: if use_customized_permute:
(permuted_hidden_states, first_token_off, inv_perm_idx, m_indices) = ( (
moe_permute( permuted_hidden_states,
qhidden_states, a1q_scale,
topk_weights=topk_weights, first_token_off,
topk_ids=topk_ids, inv_perm_idx,
token_expert_indices=token_expert_indices, m_indices,
topk=topk, ) = moe_permute(
n_expert=num_experts, qhidden_states,
n_local_expert=num_experts, a1q_scale=None,
expert_map=None, topk_ids=topk_ids,
align_block_size=align_block_size, n_expert=num_experts,
) expert_map=None,
align_block_size=align_block_size,
) )
# convert to fp16/bf16 as gemm output # convert to fp16/bf16 as gemm output
return ( return (
@ -191,16 +194,19 @@ def benchmark_unpermute(
def run(input: tuple): def run(input: tuple):
if use_customized_permute: if use_customized_permute:
(permuted_hidden_states, first_token_off, inv_perm_idx, m_indices) = input (
permuted_hidden_states,
first_token_off,
inv_perm_idx,
m_indices,
) = input
output = torch.empty_like(hidden_states)
moe_unpermute( moe_unpermute(
output,
permuted_hidden_states, permuted_hidden_states,
topk_weights, topk_weights,
topk_ids,
inv_perm_idx, inv_perm_idx,
first_token_off, first_token_off,
topk,
num_experts,
num_experts,
) )
else: else:
( (
@ -211,7 +217,11 @@ def benchmark_unpermute(
inv_perm, inv_perm,
) = input ) = input
_moe_unpermute_and_reduce( _moe_unpermute_and_reduce(
output_hidden_states, permuted_hidden_states, inv_perm, topk_weights output_hidden_states,
permuted_hidden_states,
inv_perm,
topk_weights,
True,
) )
# JIT compilation & warmup # JIT compilation & warmup

View File

@ -151,7 +151,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
ops.impl("rotary_embedding", torch::kCPU, &rotary_embedding); ops.impl("rotary_embedding", torch::kCPU, &rotary_embedding);
// Quantization // Quantization
#if defined(__AVX512F__) || defined(__aarch64__) #if defined(__AVX512F__) || (defined(__aarch64__) && !defined(__APPLE__))
at::Tag stride_tag = at::Tag::needs_fixed_stride_order; at::Tag stride_tag = at::Tag::needs_fixed_stride_order;
// Compute int8 quantized tensor for given scaling factor. // Compute int8 quantized tensor for given scaling factor.

View File

@ -10,32 +10,28 @@
void moe_permute( void moe_permute(
const torch::Tensor& input, // [n_token, hidden] const torch::Tensor& input, // [n_token, hidden]
const torch::Tensor& topk_weights, //[n_token, topk] const torch::Tensor& topk_ids, // [n_token, topk]
torch::Tensor& topk_ids, // [n_token, topk]
const torch::Tensor& token_expert_indices, // [n_token, topk] const torch::Tensor& token_expert_indices, // [n_token, topk]
const std::optional<torch::Tensor>& expert_map, // [n_expert] const std::optional<torch::Tensor>& expert_map, // [n_expert]
int64_t n_expert, int64_t n_local_expert, int64_t topk, int64_t n_expert, int64_t n_local_expert, int64_t topk,
const std::optional<int64_t>& align_block_size, const std::optional<int64_t>& align_block_size,
torch::Tensor& torch::Tensor& permuted_input, // [permuted_size, hidden]
permuted_input, // [topk * n_token/align_block_size_m, hidden]
torch::Tensor& expert_first_token_offset, // [n_local_expert + 1] torch::Tensor& expert_first_token_offset, // [n_local_expert + 1]
torch::Tensor& src_row_id2dst_row_id_map, // [n_token, topk] torch::Tensor& inv_permuted_idx, // [n_token, topk]
torch::Tensor& permuted_idx, // [permute_size]
torch::Tensor& m_indices) { // [align_expand_m] torch::Tensor& m_indices) { // [align_expand_m]
TORCH_CHECK(topk_weights.scalar_type() == at::ScalarType::Float,
"topk_weights must be float32");
TORCH_CHECK(expert_first_token_offset.scalar_type() == at::ScalarType::Long, TORCH_CHECK(expert_first_token_offset.scalar_type() == at::ScalarType::Long,
"expert_first_token_offset must be int64"); "expert_first_token_offset must be int64");
TORCH_CHECK(topk_ids.scalar_type() == at::ScalarType::Int, TORCH_CHECK(topk_ids.scalar_type() == at::ScalarType::Int,
"topk_ids must be int32"); "topk_ids must be int32");
TORCH_CHECK(token_expert_indices.scalar_type() == at::ScalarType::Int, TORCH_CHECK(token_expert_indices.scalar_type() == at::ScalarType::Int,
"token_expert_indices must be int32"); "token_expert_indices must be int32");
TORCH_CHECK(src_row_id2dst_row_id_map.scalar_type() == at::ScalarType::Int, TORCH_CHECK(inv_permuted_idx.scalar_type() == at::ScalarType::Int,
"src_row_id2dst_row_id_map must be int32"); "inv_permuted_idx must be int32");
TORCH_CHECK(expert_first_token_offset.size(0) == n_local_expert + 1, TORCH_CHECK(expert_first_token_offset.size(0) == n_local_expert + 1,
"expert_first_token_offset shape != n_local_expert+1") "expert_first_token_offset shape != n_local_expert+1")
TORCH_CHECK( TORCH_CHECK(inv_permuted_idx.sizes() == token_expert_indices.sizes(),
src_row_id2dst_row_id_map.sizes() == token_expert_indices.sizes(), "token_expert_indices shape must be same as inv_permuted_idx");
"token_expert_indices shape must be same as src_row_id2dst_row_id_map");
auto n_token = input.sizes()[0]; auto n_token = input.sizes()[0];
auto n_hidden = input.sizes()[1]; auto n_hidden = input.sizes()[1];
auto align_block_size_value = auto align_block_size_value =
@ -46,8 +42,9 @@ void moe_permute(
auto sort_workspace = torch::empty( auto sort_workspace = torch::empty(
{sorter_size}, {sorter_size},
torch::dtype(torch::kInt8).device(torch::kCUDA).requires_grad(false)); torch::dtype(torch::kInt8).device(torch::kCUDA).requires_grad(false));
auto copy_topk_ids = topk_ids.clone(); // copy topk_ids for preprocess
auto permuted_experts_id = torch::empty_like(topk_ids); auto permuted_experts_id = torch::empty_like(topk_ids);
auto dst_row_id2src_row_id_map = torch::empty_like(src_row_id2dst_row_id_map); auto sorted_row_idx = torch::empty_like(inv_permuted_idx);
auto align_expert_first_token_offset = auto align_expert_first_token_offset =
torch::zeros_like(expert_first_token_offset); torch::zeros_like(expert_first_token_offset);
@ -67,24 +64,22 @@ void moe_permute(
const int* expert_map_ptr = get_ptr<int>(expert_map.value()); const int* expert_map_ptr = get_ptr<int>(expert_map.value());
valid_num_ptr = valid_num_ptr =
get_ptr<int64_t>(expert_first_token_offset) + n_local_expert; get_ptr<int64_t>(expert_first_token_offset) + n_local_expert;
preprocessTopkIdLauncher(get_ptr<int>(topk_ids), n_token * topk, preprocessTopkIdLauncher(get_ptr<int>(copy_topk_ids), n_token * topk,
expert_map_ptr, n_expert, stream); expert_map_ptr, n_expert, stream);
} }
// sort the top-k expert ids, then scan them to get expert_first_token_offset // sort the top-k expert ids, then scan them to get expert_first_token_offset
sortAndScanExpert(get_ptr<int>(topk_ids), get_ptr<int>(token_expert_indices), sortAndScanExpert(
get_ptr<int>(permuted_experts_id), get_ptr<int>(copy_topk_ids), get_ptr<int>(token_expert_indices),
get_ptr<int>(dst_row_id2src_row_id_map), get_ptr<int>(permuted_experts_id), get_ptr<int>(sorted_row_idx),
get_ptr<int64_t>(expert_first_token_offset), n_token, get_ptr<int64_t>(expert_first_token_offset), n_token, n_expert,
n_expert, n_local_expert, topk, sorter, n_local_expert, topk, sorter, get_ptr<int>(sort_workspace), stream);
get_ptr<int>(sort_workspace), stream);
// dispatch expandInputRowsKernelLauncher // dispatch expandInputRowsKernelLauncher
MOE_DISPATCH(input.scalar_type(), [&] { MOE_DISPATCH(input.scalar_type(), [&] {
expandInputRowsKernelLauncher<scalar_t>( expandInputRowsKernelLauncher<scalar_t>(
get_ptr<scalar_t>(input), get_ptr<scalar_t>(permuted_input), get_ptr<scalar_t>(input), get_ptr<scalar_t>(permuted_input),
get_ptr<float>(topk_weights), get_ptr<int>(permuted_experts_id), get_ptr<int>(permuted_experts_id), get_ptr<int>(sorted_row_idx),
get_ptr<int>(dst_row_id2src_row_id_map), get_ptr<int>(inv_permuted_idx), get_ptr<int>(permuted_idx),
get_ptr<int>(src_row_id2dst_row_id_map),
get_ptr<int64_t>(expert_first_token_offset), n_token, valid_num_ptr, get_ptr<int64_t>(expert_first_token_offset), n_token, valid_num_ptr,
n_hidden, topk, n_local_expert, align_block_size_value, stream); n_hidden, topk, n_local_expert, align_block_size_value, stream);
}); });
@ -101,32 +96,34 @@ void moe_permute(
} }
void moe_unpermute( void moe_unpermute(
const torch::Tensor& permuted_hidden_states, // [n_token * topk, hidden] const torch::Tensor& permuted_hidden_states, // [n_token * topk, hidden]
const torch::Tensor& topk_weights, //[n_token, topk] const torch::Tensor& topk_weights, // [n_token, topk]
const torch::Tensor& topk_ids, // [n_token, topk] const torch::Tensor& inv_permuted_idx, // [n_token, topk]
const torch::Tensor& src_row_id2dst_row_id_map, // [n_token, topk] const std::optional<torch::Tensor>&
const torch::Tensor& expert_first_token_offset, // [n_local_expert+1] expert_first_token_offset, // [n_local_expert+1]
int64_t n_expert, int64_t n_local_expert, int64_t topk, int64_t topk,
torch::Tensor& hidden_states // [n_token, hidden] torch::Tensor& hidden_states // [n_token, hidden]
) { ) {
TORCH_CHECK(src_row_id2dst_row_id_map.sizes() == topk_ids.sizes(),
"topk_ids shape must be same as src_row_id2dst_row_id_map");
TORCH_CHECK(topk_ids.scalar_type() == at::ScalarType::Int,
"topk_ids must be int32");
TORCH_CHECK( TORCH_CHECK(
permuted_hidden_states.scalar_type() == hidden_states.scalar_type(), permuted_hidden_states.scalar_type() == hidden_states.scalar_type(),
"topk_ids dtype must be same as src_row_id2dst_row_id_map"); "permuted_hidden_states dtype must be same as hidden_states");
auto n_token = hidden_states.size(0); auto n_token = hidden_states.size(0);
auto n_hidden = hidden_states.size(1); auto n_hidden = hidden_states.size(1);
auto stream = at::cuda::getCurrentCUDAStream().stream(); auto stream = at::cuda::getCurrentCUDAStream().stream();
const int64_t* valid_ptr =
get_ptr<int64_t>(expert_first_token_offset) + n_local_expert; int64_t const* valid_ptr = nullptr;
if (expert_first_token_offset.has_value()) {
int n_local_expert = expert_first_token_offset.value().size(0) - 1;
valid_ptr =
get_ptr<int64_t>(expert_first_token_offset.value()) + n_local_expert;
}
MOE_DISPATCH(hidden_states.scalar_type(), [&] { MOE_DISPATCH(hidden_states.scalar_type(), [&] {
finalizeMoeRoutingKernelLauncher<scalar_t, scalar_t>( finalizeMoeRoutingKernelLauncher<scalar_t, scalar_t>(
get_ptr<scalar_t>(permuted_hidden_states), get_ptr<scalar_t>(permuted_hidden_states),
get_ptr<scalar_t>(hidden_states), get_ptr<float>(topk_weights), get_ptr<scalar_t>(hidden_states), get_ptr<float>(topk_weights),
get_ptr<int>(src_row_id2dst_row_id_map), get_ptr<int>(topk_ids), get_ptr<int>(inv_permuted_idx), n_token, n_hidden, topk, valid_ptr,
n_token, n_hidden, topk, valid_ptr, stream); stream);
}); });
} }

View File

@ -177,7 +177,7 @@ __global__ void getMIndicesKernel(int64_t* expert_first_token_offset,
int tidx = threadIdx.x; int tidx = threadIdx.x;
extern __shared__ int64_t smem_expert_first_token_offset[]; extern __shared__ int64_t smem_expert_first_token_offset[];
for (int i = tidx; i <= num_local_expert; i += blockDim.x) { for (int i = tidx; i <= num_local_expert; i += blockDim.x) {
smem_expert_first_token_offset[tidx] = __ldg(expert_first_token_offset + i); smem_expert_first_token_offset[i] = __ldg(expert_first_token_offset + i);
} }
__syncthreads(); __syncthreads();
auto last_token_offset = smem_expert_first_token_offset[eidx + 1]; auto last_token_offset = smem_expert_first_token_offset[eidx + 1];

View File

@ -57,31 +57,19 @@ void sortAndScanExpert(int* expert_for_source_row, const int* source_rows,
template <typename T> template <typename T>
void expandInputRowsKernelLauncher( void expandInputRowsKernelLauncher(
T const* unpermuted_input, T* permuted_output, T const* unpermuted_input, T* permuted_output, int* sorted_experts,
const float* unpermuted_scales, int* sorted_experts,
int const* expanded_dest_row_to_expanded_source_row, int const* expanded_dest_row_to_expanded_source_row,
int* expanded_source_row_to_expanded_dest_row, int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
int64_t* expert_first_token_offset, int64_t const num_rows, int64_t* expert_first_token_offset, int64_t const num_rows,
int64_t const* num_valid_tokens_ptr, int64_t const cols, int const k, int64_t const* num_valid_tokens_ptr, int64_t const cols, int const k,
int num_local_experts, const int& align_block_size, cudaStream_t stream); int num_local_experts, const int& align_block_size, cudaStream_t stream);
// Final kernel to unpermute and scale
// This kernel unpermutes the original data, does the k-way reduction and
// performs the final skip connection.
template <typename T, typename OutputType, bool CHECK_SKIPPED>
__global__ void finalizeMoeRoutingKernel(
T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output,
float const* scales, int const* expanded_source_row_to_expanded_dest_row,
int const* expert_for_source_row, int64_t const orig_cols, int64_t const k,
int64_t const* num_valid_ptr);
template <class T, class OutputType> template <class T, class OutputType>
void finalizeMoeRoutingKernelLauncher( void finalizeMoeRoutingKernelLauncher(
T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output, T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output,
float const* scales, int const* expanded_source_row_to_expanded_dest_row, float const* scales, int const* expanded_source_row_to_expanded_dest_row,
int const* expert_for_source_row, int64_t const num_rows, int64_t const num_rows, int64_t const cols, int64_t const k,
int64_t const cols, int64_t const k, int64_t const* num_valid_ptr, int64_t const* num_valid_ptr, cudaStream_t stream);
cudaStream_t stream);
void preprocessTopkIdLauncher(int* topk_id_ptr, int size, void preprocessTopkIdLauncher(int* topk_id_ptr, int size,
const int* expert_map_ptr, int num_experts, const int* expert_map_ptr, int num_experts,

View File

@ -2,10 +2,9 @@
template <typename T, bool CHECK_SKIPPED, bool ALIGN_BLOCK_SIZE> template <typename T, bool CHECK_SKIPPED, bool ALIGN_BLOCK_SIZE>
__global__ void expandInputRowsKernel( __global__ void expandInputRowsKernel(
T const* unpermuted_input, T* permuted_output, T const* unpermuted_input, T* permuted_output, int* sorted_experts,
const float* unpermuted_scales, int* sorted_experts,
int const* expanded_dest_row_to_expanded_source_row, int const* expanded_dest_row_to_expanded_source_row,
int* expanded_source_row_to_expanded_dest_row, int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
int64_t* expert_first_token_offset, int64_t const num_rows, int64_t* expert_first_token_offset, int64_t const num_rows,
int64_t const* num_dest_rows, int64_t const cols, int64_t k, int64_t const* num_dest_rows, int64_t const cols, int64_t k,
int num_local_experts, int align_block_size) { int num_local_experts, int align_block_size) {
@ -54,6 +53,10 @@ __global__ void expandInputRowsKernel(
assert(expanded_dest_row <= INT32_MAX); assert(expanded_dest_row <= INT32_MAX);
expanded_source_row_to_expanded_dest_row[expanded_source_row] = expanded_source_row_to_expanded_dest_row[expanded_source_row] =
static_cast<int>(expanded_dest_row); static_cast<int>(expanded_dest_row);
// skip non local expert token
if (!CHECK_SKIPPED || blockIdx.x < *num_dest_rows) {
permuted_idx[expanded_dest_row] = expanded_source_row;
}
} }
if (!CHECK_SKIPPED || blockIdx.x < *num_dest_rows) { if (!CHECK_SKIPPED || blockIdx.x < *num_dest_rows) {
@ -62,7 +65,7 @@ __global__ void expandInputRowsKernel(
using DataElem = cutlass::Array<T, ELEM_PER_THREAD>; using DataElem = cutlass::Array<T, ELEM_PER_THREAD>;
// Duplicate and permute rows // Duplicate and permute rows
int64_t const source_row = expanded_source_row % num_rows; int64_t const source_row = expanded_source_row / k;
auto const* source_row_ptr = auto const* source_row_ptr =
reinterpret_cast<DataElem const*>(unpermuted_input + source_row * cols); reinterpret_cast<DataElem const*>(unpermuted_input + source_row * cols);
@ -82,10 +85,9 @@ __global__ void expandInputRowsKernel(
template <typename T> template <typename T>
void expandInputRowsKernelLauncher( void expandInputRowsKernelLauncher(
T const* unpermuted_input, T* permuted_output, T const* unpermuted_input, T* permuted_output, int* sorted_experts,
const float* unpermuted_scales, int* sorted_experts,
int const* expanded_dest_row_to_expanded_source_row, int const* expanded_dest_row_to_expanded_source_row,
int* expanded_source_row_to_expanded_dest_row, int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
int64_t* expert_first_token_offset, int64_t const num_rows, int64_t* expert_first_token_offset, int64_t const num_rows,
int64_t const* num_valid_tokens_ptr, int64_t const cols, int const k, int64_t const* num_valid_tokens_ptr, int64_t const cols, int const k,
int num_local_experts, const int& align_block_size, cudaStream_t stream) { int num_local_experts, const int& align_block_size, cudaStream_t stream) {
@ -105,11 +107,11 @@ void expandInputRowsKernelLauncher(
int64_t smem_size = sizeof(int64_t) * (num_local_experts + 1); int64_t smem_size = sizeof(int64_t) * (num_local_experts + 1);
func<<<blocks, threads, smem_size, stream>>>( func<<<blocks, threads, smem_size, stream>>>(
unpermuted_input, permuted_output, unpermuted_scales, sorted_experts, unpermuted_input, permuted_output, sorted_experts,
expanded_dest_row_to_expanded_source_row, expanded_dest_row_to_expanded_source_row,
expanded_source_row_to_expanded_dest_row, expert_first_token_offset, expanded_source_row_to_expanded_dest_row, permuted_idx,
num_rows, num_valid_tokens_ptr, cols, k, num_local_experts, expert_first_token_offset, num_rows, num_valid_tokens_ptr, cols, k,
align_block_size); num_local_experts, align_block_size);
} }
template <class T, class U> template <class T, class U>
@ -128,11 +130,9 @@ template <typename T, typename OutputType, bool CHECK_SKIPPED>
__global__ void finalizeMoeRoutingKernel( __global__ void finalizeMoeRoutingKernel(
T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output, T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output,
float const* scales, int const* expanded_source_row_to_expanded_dest_row, float const* scales, int const* expanded_source_row_to_expanded_dest_row,
int const* expert_for_source_row, int64_t const orig_cols, int64_t const k, int64_t const orig_cols, int64_t const k, int64_t const* num_valid_ptr) {
int64_t const* num_valid_ptr) {
assert(orig_cols % 4 == 0); assert(orig_cols % 4 == 0);
int64_t const original_row = blockIdx.x; int64_t const original_row = blockIdx.x;
int64_t const num_rows = gridDim.x;
auto const offset = original_row * orig_cols; auto const offset = original_row * orig_cols;
OutputType* reduced_row_ptr = reduced_unpermuted_output + offset; OutputType* reduced_row_ptr = reduced_unpermuted_output + offset;
int64_t const num_valid = *num_valid_ptr; int64_t const num_valid = *num_valid_ptr;
@ -159,14 +159,13 @@ __global__ void finalizeMoeRoutingKernel(
ComputeElem thread_output; ComputeElem thread_output;
thread_output.fill(0); thread_output.fill(0);
for (int k_idx = 0; k_idx < k; ++k_idx) { for (int k_idx = 0; k_idx < k; ++k_idx) {
int64_t const expanded_original_row = original_row + k_idx * num_rows; int64_t const expanded_original_row = original_row * k + k_idx;
int64_t const expanded_permuted_row = int64_t const expanded_permuted_row =
expanded_source_row_to_expanded_dest_row[expanded_original_row]; expanded_source_row_to_expanded_dest_row[expanded_original_row];
int64_t const k_offset = original_row * k + k_idx; int64_t const k_offset = original_row * k + k_idx;
float const row_scale = scales[k_offset]; float const row_scale = scales[k_offset];
// Check after row_rescale has accumulated
if (CHECK_SKIPPED && expanded_permuted_row >= num_valid) { if (CHECK_SKIPPED && expanded_permuted_row >= num_valid) {
continue; continue;
} }
@ -189,9 +188,8 @@ template <class T, class OutputType>
void finalizeMoeRoutingKernelLauncher( void finalizeMoeRoutingKernelLauncher(
T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output, T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output,
float const* scales, int const* expanded_source_row_to_expanded_dest_row, float const* scales, int const* expanded_source_row_to_expanded_dest_row,
int const* expert_for_source_row, int64_t const num_rows, int64_t const num_rows, int64_t const cols, int64_t const k,
int64_t const cols, int64_t const k, int64_t const* num_valid_ptr, int64_t const* num_valid_ptr, cudaStream_t stream) {
cudaStream_t stream) {
int64_t const blocks = num_rows; int64_t const blocks = num_rows;
int64_t const threads = 256; int64_t const threads = 256;
bool const check_finished = num_valid_ptr != nullptr; bool const check_finished = num_valid_ptr != nullptr;
@ -201,6 +199,5 @@ void finalizeMoeRoutingKernelLauncher(
auto* const kernel = func_map[check_finished]; auto* const kernel = func_map[check_finished];
kernel<<<blocks, threads, 0, stream>>>( kernel<<<blocks, threads, 0, stream>>>(
expanded_permuted_rows, reduced_unpermuted_output, scales, expanded_permuted_rows, reduced_unpermuted_output, scales,
expanded_source_row_to_expanded_dest_row, expert_for_source_row, cols, k, expanded_source_row_to_expanded_dest_row, cols, k, num_valid_ptr);
num_valid_ptr);
} }

View File

@ -56,18 +56,17 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
" -> Tensor"); " -> Tensor");
m.def( m.def(
"moe_permute(Tensor input, Tensor topk_weight, Tensor! topk_ids," "moe_permute(Tensor input, Tensor topk_ids,"
"Tensor token_expert_indices, Tensor? expert_map, int n_expert," "Tensor token_expert_indices, Tensor? expert_map, int n_expert,"
"int n_local_expert," "int n_local_expert,"
"int topk, int? align_block_size,Tensor! permuted_input, Tensor! " "int topk, int? align_block_size,Tensor! permuted_input, Tensor! "
"expert_first_token_offset, Tensor! src_row_id2dst_row_id_map, Tensor! " "expert_first_token_offset, Tensor! inv_permuted_idx, Tensor! "
"m_indices)->()"); "permuted_idx, Tensor! m_indices)->()");
m.def( m.def(
"moe_unpermute(Tensor permuted_hidden_states, Tensor topk_weights," "moe_unpermute(Tensor permuted_hidden_states, Tensor topk_weights,"
"Tensor topk_ids,Tensor src_row_id2dst_row_id_map, Tensor " "Tensor inv_permuted_idx, Tensor? expert_first_token_offset, "
"expert_first_token_offset, int n_expert, int n_local_expert,int " "int topk, Tensor! hidden_states)->()");
"topk, Tensor! hidden_states)->()");
m.def("moe_permute_unpermute_supported() -> bool"); m.def("moe_permute_unpermute_supported() -> bool");
m.impl("moe_permute_unpermute_supported", &moe_permute_unpermute_supported); m.impl("moe_permute_unpermute_supported", &moe_permute_unpermute_supported);

View File

@ -292,6 +292,11 @@ void per_token_group_quant_fp8(const torch::Tensor& input,
torch::Tensor& output_q, torch::Tensor& output_s, torch::Tensor& output_q, torch::Tensor& output_s,
int64_t group_size, double eps, double fp8_min, int64_t group_size, double eps, double fp8_min,
double fp8_max, bool scale_ue8m0); double fp8_max, bool scale_ue8m0);
void per_token_group_quant_int8(const torch::Tensor& input,
torch::Tensor& output_q,
torch::Tensor& output_s, int64_t group_size,
double eps, double int8_min, double int8_max);
#endif #endif
void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input, void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,

View File

@ -1,6 +1,8 @@
#include <ATen/cuda/CUDAContext.h> #include <ATen/cuda/CUDAContext.h>
#include <torch/all.h> #include <torch/all.h>
#include "../per_token_group_quant_8bit.h"
#include <cmath> #include <cmath>
#include "../../dispatch_utils.h" #include "../../dispatch_utils.h"
@ -336,3 +338,11 @@ void dynamic_scaled_int8_quant(
} }
}); });
} }
// INT8 entry point for per-token-group quantization.
//
// Forwards every argument unchanged to the shared 8-bit helper
// per_token_group_quant_8bit, passing int8_min / int8_max as that helper's
// min_8bit / max_8bit bounds. The helper's trailing scale_ue8m0 parameter is
// not supplied here, so it takes the default from its declaration.
void per_token_group_quant_int8(const torch::Tensor& input,
torch::Tensor& output_q,
torch::Tensor& output_s, int64_t group_size,
double eps, double int8_min, double int8_max) {
per_token_group_quant_8bit(input, output_q, output_s, group_size, eps,
int8_min, int8_max);
}

View File

@ -1,6 +1,8 @@
#include <ATen/cuda/CUDAContext.h> #include <ATen/cuda/CUDAContext.h>
#include <c10/util/Float8_e4m3fn.h> #include <c10/util/Float8_e4m3fn.h>
#include "../per_token_group_quant_8bit.h"
#include <cmath> #include <cmath>
#include <cuda_fp16.h> #include <cuda_fp16.h>
@ -120,7 +122,7 @@ void per_token_group_quant_8bit(const torch::Tensor& input,
torch::Tensor& output_q, torch::Tensor& output_q,
torch::Tensor& output_s, int64_t group_size, torch::Tensor& output_s, int64_t group_size,
double eps, double min_8bit, double max_8bit, double eps, double min_8bit, double max_8bit,
bool scale_ue8m0 = false) { bool scale_ue8m0) {
TORCH_CHECK(input.is_contiguous()); TORCH_CHECK(input.is_contiguous());
TORCH_CHECK(output_q.is_contiguous()); TORCH_CHECK(output_q.is_contiguous());
@ -198,6 +200,8 @@ void per_token_group_quant_8bit(const torch::Tensor& input,
input.scalar_type(), "per_token_group_quant_8bit", ([&] { input.scalar_type(), "per_token_group_quant_8bit", ([&] {
if (dst_type == at::ScalarType::Float8_e4m3fn) { if (dst_type == at::ScalarType::Float8_e4m3fn) {
LAUNCH_KERNEL(scalar_t, c10::Float8_e4m3fn); LAUNCH_KERNEL(scalar_t, c10::Float8_e4m3fn);
} else if (dst_type == at::ScalarType::Char) {
LAUNCH_KERNEL(scalar_t, int8_t);
} }
})); }));

View File

@ -187,8 +187,12 @@ struct PrepackedLayoutBTemplate {
CUTE_HOST_DEVICE static constexpr auto TVbNbKL_to_offset_copy( CUTE_HOST_DEVICE static constexpr auto TVbNbKL_to_offset_copy(
Shape_NKL shape_mkl) { Shape_NKL shape_mkl) {
auto layout = TVbNbKL_to_offset(shape_mkl); auto layout = TVbNbKL_to_offset(shape_mkl);
return make_layout(coalesce(get<0>(layout)), get<1>(layout), // for 4-bit elements, having >= 64 values per column
get<2>(layout)); // allows TMA to load full 32-byte sectors
auto inner_layout =
make_layout(make_shape(_256{}, size<0>(layout) / _256{}));
return make_layout(inner_layout, get<1>(layout), get<2>(layout));
} }
// ((BlockN, BlockK), (BlocksN, BlocksK), L) -> (storage_idx) // ((BlockN, BlockK), (BlocksN, BlocksK), L) -> (storage_idx)

View File

@ -0,0 +1,10 @@
#pragma once
#include <torch/all.h>
// TODO(wentao): refactor the folder to 8bit, then includes fp8 and int8 folders
// 8-bit per-token-group quantization helper used by both FP8 and INT8
void per_token_group_quant_8bit(const torch::Tensor& input,
torch::Tensor& output_q,
torch::Tensor& output_s, int64_t group_size,
double eps, double min_8bit, double max_8bit,
bool scale_ue8m0 = false);

View File

@ -624,6 +624,14 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
ops.impl("per_token_group_fp8_quant", torch::kCUDA, ops.impl("per_token_group_fp8_quant", torch::kCUDA,
&per_token_group_quant_fp8); &per_token_group_quant_fp8);
// Compute per-token-group INT8 quantized tensor and scaling factor.
ops.def(
"per_token_group_quant_int8(Tensor input, Tensor! output_q, Tensor! "
"output_s, int group_size, float eps, float int8_min, float int8_max) -> "
"()");
ops.impl("per_token_group_quant_int8", torch::kCUDA,
&per_token_group_quant_int8);
// reorder weight for AllSpark Ampere W8A16 Fused Gemm kernel // reorder weight for AllSpark Ampere W8A16 Fused Gemm kernel
ops.def( ops.def(
"rearrange_kn_weight_as_n32k16_order(Tensor b_qweight, Tensor b_scales, " "rearrange_kn_weight_as_n32k16_order(Tensor b_qweight, Tensor b_scales, "

View File

@ -1,62 +0,0 @@
# This vLLM Dockerfile is used to construct an image that can build and run vLLM on ARM CPU platform.
FROM ubuntu:22.04 AS cpu-test-arm
ENV CCACHE_DIR=/root/.cache/ccache
ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
RUN --mount=type=cache,target=/var/cache/apt \
apt-get update -y \
&& apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
# tcmalloc provides better memory allocation efficiency, e.g., holding memory in caches to speed up access of commonly-used objects.
RUN --mount=type=cache,target=/root/.cache/pip \
pip install py-cpuinfo # Use this to gather CPU info and optimize based on ARM Neoverse cores
# Set LD_PRELOAD for tcmalloc on ARM
ENV LD_PRELOAD="/usr/lib/aarch64-linux-gnu/libtcmalloc_minimal.so.4"
RUN echo 'ulimit -c 0' >> ~/.bashrc
WORKDIR /workspace
ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,src=requirements/build.txt,target=requirements/build.txt \
pip install --upgrade pip && \
pip install -r requirements/build.txt
FROM cpu-test-arm AS build
WORKDIR /workspace/vllm
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,src=requirements/common.txt,target=requirements/common.txt \
--mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \
pip install -v -r requirements/cpu.txt
COPY . .
ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
# Disabling AVX512 specific optimizations for ARM
ARG VLLM_CPU_DISABLE_AVX512="true"
ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/ccache \
--mount=type=bind,source=.git,target=.git \
VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
pip install dist/*.whl && \
rm -rf dist
WORKDIR /workspace/
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]

View File

@ -1,4 +1,11 @@
# This vLLM Dockerfile is used to construct image that can build and run vLLM on x86 CPU platform. # This vLLM Dockerfile is used to build images that can run vLLM on both x86_64 and arm64 CPU platforms.
#
# Supported platforms:
# - linux/amd64 (x86_64)
# - linux/arm64 (aarch64)
#
# Use the `--platform` option with `docker buildx build` to specify the target architecture, e.g.:
# docker buildx build --platform=linux/arm64 -f docker/Dockerfile.cpu .
# #
# Build targets: # Build targets:
# vllm-openai (default): used for serving deployment # vllm-openai (default): used for serving deployment
@ -53,7 +60,20 @@ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --upgrade pip && \ uv pip install --upgrade pip && \
uv pip install -r requirements/cpu.txt uv pip install -r requirements/cpu.txt
ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/opt/venv/lib/libiomp5.so:$LD_PRELOAD" ARG TARGETARCH
ENV TARGETARCH=${TARGETARCH}
RUN if [ "$TARGETARCH" = "arm64" ]; then \
PRELOAD_PATH="/usr/lib/aarch64-linux-gnu/libtcmalloc_minimal.so.4"; \
else \
PRELOAD_PATH="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/opt/venv/lib/libiomp5.so"; \
fi && \
echo "export LD_PRELOAD=$PRELOAD_PATH" >> ~/.bashrc
# Ensure that the LD_PRELOAD environment variable for export is in effect.
SHELL ["/bin/bash", "-c"]
ENV LD_PRELOAD=${LD_PRELOAD}
RUN echo 'ulimit -c 0' >> ~/.bashrc RUN echo 'ulimit -c 0' >> ~/.bashrc

View File

@ -1,4 +1,4 @@
ARG NIGHTLY_DATE="20250714" ARG NIGHTLY_DATE="20250724"
ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.12_tpuvm_$NIGHTLY_DATE" ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.12_tpuvm_$NIGHTLY_DATE"
FROM $BASE_IMAGE FROM $BASE_IMAGE

View File

@ -9,10 +9,13 @@ We support tracing vLLM workers using the `torch.profiler` module. You can enabl
The OpenAI server also needs to be started with the `VLLM_TORCH_PROFILER_DIR` environment variable set. The OpenAI server also needs to be started with the `VLLM_TORCH_PROFILER_DIR` environment variable set.
When using `benchmarks/benchmark_serving.py`, you can enable profiling by passing the `--profile` flag. When using `vllm bench serve`, you can enable profiling by passing the `--profile` flag.
Traces can be visualized using <https://ui.perfetto.dev/>. Traces can be visualized using <https://ui.perfetto.dev/>.
!!! tip
You can call the bench module directly, without installing vLLM, by running `python -m vllm.entrypoints.cli.main bench`.
!!! tip !!! tip
Only send a few requests through vLLM when profiling, as the traces can get quite large. Also, no need to untar the traces, they can be viewed directly. Only send a few requests through vLLM when profiling, as the traces can get quite large. Also, no need to untar the traces, they can be viewed directly.
@ -35,10 +38,10 @@ VLLM_TORCH_PROFILER_DIR=./vllm_profile \
--model meta-llama/Meta-Llama-3-70B --model meta-llama/Meta-Llama-3-70B
``` ```
benchmark_serving.py: vllm bench command:
```bash ```bash
python benchmarks/benchmark_serving.py \ vllm bench serve \
--backend vllm \ --backend vllm \
--model meta-llama/Meta-Llama-3-70B \ --model meta-llama/Meta-Llama-3-70B \
--dataset-name sharegpt \ --dataset-name sharegpt \
@ -69,13 +72,13 @@ apt install nsight-systems-cli
For basic usage, you can just append `nsys profile -o report.nsys-rep --trace-fork-before-exec=true --cuda-graph-trace=node` before any existing script you would run for offline inference. For basic usage, you can just append `nsys profile -o report.nsys-rep --trace-fork-before-exec=true --cuda-graph-trace=node` before any existing script you would run for offline inference.
The following is an example using the `benchmarks/benchmark_latency.py` script: The following is an example using the `vllm bench latency` script:
```bash ```bash
nsys profile -o report.nsys-rep \ nsys profile -o report.nsys-rep \
--trace-fork-before-exec=true \ --trace-fork-before-exec=true \
--cuda-graph-trace=node \ --cuda-graph-trace=node \
python benchmarks/benchmark_latency.py \ vllm bench latency \
--model meta-llama/Llama-3.1-8B-Instruct \ --model meta-llama/Llama-3.1-8B-Instruct \
--num-iters-warmup 5 \ --num-iters-warmup 5 \
--num-iters 1 \ --num-iters 1 \
@ -98,7 +101,7 @@ nsys profile -o report.nsys-rep \
vllm serve meta-llama/Llama-3.1-8B-Instruct vllm serve meta-llama/Llama-3.1-8B-Instruct
# client # client
python benchmarks/benchmark_serving.py \ vllm bench serve \
--backend vllm \ --backend vllm \
--model meta-llama/Llama-3.1-8B-Instruct \ --model meta-llama/Llama-3.1-8B-Instruct \
--num-prompts 1 \ --num-prompts 1 \
@ -132,7 +135,7 @@ You can view these profiles either as summaries in the CLI, using `nsys stats [p
... ...
** CUDA GPU Kernel Summary (cuda_gpu_kern_sum): ** CUDA GPU Kernel Summary (cuda_gpu_kern_sum):
Time (%) Total Time (ns) Instances Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name Time (%) Total Time (ns) Instances Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name
-------- --------------- --------- ----------- ----------- -------- --------- ----------- ---------------------------------------------------------------------------------------------------- -------- --------------- --------- ----------- ----------- -------- --------- ----------- ----------------------------------------------------------------------------------------------------
46.3 10,327,352,338 17,505 589,965.9 144,383.0 27,040 3,126,460 944,263.8 sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize128x128x64_warpgroupsize1x1x1_execute_segment_k_of… 46.3 10,327,352,338 17,505 589,965.9 144,383.0 27,040 3,126,460 944,263.8 sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize128x128x64_warpgroupsize1x1x1_execute_segment_k_of…
14.8 3,305,114,764 5,152 641,520.7 293,408.0 287,296 2,822,716 867,124.9 sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize256x128x64_warpgroupsize2x1x1_execute_segment_k_of… 14.8 3,305,114,764 5,152 641,520.7 293,408.0 287,296 2,822,716 867,124.9 sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize256x128x64_warpgroupsize2x1x1_execute_segment_k_of…
@ -143,7 +146,7 @@ You can view these profiles either as summaries in the CLI, using `nsys stats [p
2.6 587,283,113 37,824 15,526.7 3,008.0 2,719 2,517,756 139,091.1 std::enable_if<T2>(int)0&&vllm::_typeConvert<T1>::exists, void>::type vllm::fused_add_rms_norm_kern… 2.6 587,283,113 37,824 15,526.7 3,008.0 2,719 2,517,756 139,091.1 std::enable_if<T2>(int)0&&vllm::_typeConvert<T1>::exists, void>::type vllm::fused_add_rms_norm_kern…
1.9 418,362,605 18,912 22,121.5 3,871.0 3,328 2,523,870 175,248.2 void vllm::rotary_embedding_kernel<c10::BFloat16, (bool)1>(const long *, T1 *, T1 *, const T1 *, in… 1.9 418,362,605 18,912 22,121.5 3,871.0 3,328 2,523,870 175,248.2 void vllm::rotary_embedding_kernel<c10::BFloat16, (bool)1>(const long *, T1 *, T1 *, const T1 *, in…
0.7 167,083,069 18,880 8,849.7 2,240.0 1,471 2,499,996 101,436.1 void vllm::reshape_and_cache_flash_kernel<__nv_bfloat16, __nv_bfloat16, (vllm::Fp8KVCacheDataType)0 0.7 167,083,069 18,880 8,849.7 2,240.0 1,471 2,499,996 101,436.1 void vllm::reshape_and_cache_flash_kernel<__nv_bfloat16, __nv_bfloat16, (vllm::Fp8KVCacheDataType)0
... ...
``` ```
GUI example: GUI example:

View File

@ -3,14 +3,14 @@ An implementation of xPyD with dynamic scaling based on point-to-point communica
# Detailed Design # Detailed Design
## Overall Process ## Overall Process
As shown in Figure 1, the overall process of this **PD disaggregation** solution is described through a request flow: As shown in Figure 1, the overall process of this **PD disaggregation** solution is described through a request flow:
1. The client sends an HTTP request to the Proxy/Router's `/v1/completions` interface. 1. The client sends an HTTP request to the Proxy/Router's `/v1/completions` interface.
2. The Proxy/Router selects a **1P1D (1 Prefill instance + 1 Decode instance)** through either round-robin or random selection, generates a `request_id` (rules to be introduced later), modifies the `max_tokens` in the HTTP request message to **1**, and then forwards the request to the **P instance**. 2. The Proxy/Router selects a **1P1D (1 Prefill instance + 1 Decode instance)** through either round-robin or random selection, generates a `request_id` (rules to be introduced later), modifies the `max_tokens` in the HTTP request message to **1**, and then forwards the request to the **P instance**.
3. Immediately afterward, the Proxy/Router forwards the **original HTTP request** to the **D instance**. 3. Immediately afterward, the Proxy/Router forwards the **original HTTP request** to the **D instance**.
4. The **P instance** performs **Prefill** and then **actively sends the generated KV cache** to the D instance (using **PUT_ASYNC** mode). The D instance's `zmq_addr` can be resolved through the `request_id`. 4. The **P instance** performs **Prefill** and then **actively sends the generated KV cache** to the D instance (using **PUT_ASYNC** mode). The D instance's `zmq_addr` can be resolved through the `request_id`.
5. The **D instance** has a **dedicated thread** for receiving the KV cache (to avoid blocking the main process). The received KV cache is saved into the **GPU memory buffer**, the size of which is determined by the vLLM startup parameter `kv_buffer_size`. When the GPU buffer is full, the KV cache is stored in the **local Tensor memory pool**. 5. The **D instance** has a **dedicated thread** for receiving the KV cache (to avoid blocking the main process). The received KV cache is saved into the **GPU memory buffer**, the size of which is determined by the vLLM startup parameter `kv_buffer_size`. When the GPU buffer is full, the KV cache is stored in the **local Tensor memory pool**.
6. During the **Decode**, the D instance's main process retrieves the KV cache (transmitted by the P instance) from either the **GPU buffer** or the **memory pool**, thereby **skipping Prefill**. 6. During the **Decode**, the D instance's main process retrieves the KV cache (transmitted by the P instance) from either the **GPU buffer** or the **memory pool**, thereby **skipping Prefill**.
7. After completing **Decode**, the D instance returns the result to the **Proxy/Router**, which then forwards it to the **client**. 7. After completing **Decode**, the D instance returns the result to the **Proxy/Router**, which then forwards it to the **client**.
![image1](https://github.com/user-attachments/assets/fb01bde6-755b-49f7-ad45-48a94b1e10a7) ![image1](https://github.com/user-attachments/assets/fb01bde6-755b-49f7-ad45-48a94b1e10a7)
@ -291,7 +291,7 @@ curl -X POST -s http://10.0.1.1:10001/v1/completions \
??? console "Command" ??? console "Command"
```shell ```shell
python3 benchmark_serving.py \ vllm bench serve \
--backend vllm \ --backend vllm \
--model base_model \ --model base_model \
--tokenizer meta-llama/Llama-3.1-8B-Instruct \ --tokenizer meta-llama/Llama-3.1-8B-Instruct \

View File

@ -177,6 +177,70 @@ Multi-image input can be extended to perform video captioning. We show this with
You can pass a list of NumPy arrays directly to the `'video'` field of the multi-modal dictionary You can pass a list of NumPy arrays directly to the `'video'` field of the multi-modal dictionary
instead of using multi-image input. instead of using multi-image input.
Instead of NumPy arrays, you can also pass `torch.Tensor` instances, as shown in this example using Qwen2.5-VL:
??? code
```python
from transformers import AutoProcessor
from vllm import LLM, SamplingParams
from qwen_vl_utils import process_vision_info
model_path = "Qwen/Qwen2.5-VL-3B-Instruct/"
video_path = "https://content.pexels.com/videos/free-videos.mp4"
llm = LLM(
model=model_path,
gpu_memory_utilization=0.8,
enforce_eager=True,
limit_mm_per_prompt={"video": 1},
)
sampling_params = SamplingParams(
max_tokens=1024,
)
video_messages = [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": [
{"type": "text", "text": "describe this video."},
{
"type": "video",
"video": video_path,
"total_pixels": 20480 * 28 * 28,
"min_pixels": 16 * 28 * 28
}
]
},
]
messages = video_messages
processor = AutoProcessor.from_pretrained(model_path)
prompt = processor.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True,
)
image_inputs, video_inputs = process_vision_info(messages)
mm_data = {}
if video_inputs is not None:
mm_data["video"] = video_inputs
llm_inputs = {
"prompt": prompt,
"multi_modal_data": mm_data,
}
outputs = llm.generate([llm_inputs], sampling_params=sampling_params)
for o in outputs:
generated_text = o.outputs[0].text
print(generated_text)
```
!!! note
`process_vision_info` is only applicable to Qwen2.5-VL and similar models.
Full example: <gh-file:examples/offline_inference/vision_language.py> Full example: <gh-file:examples/offline_inference/vision_language.py>
### Audio Inputs ### Audio Inputs

View File

@ -6,6 +6,7 @@ Contents:
- [Supported Hardware](supported_hardware.md) - [Supported Hardware](supported_hardware.md)
- [AutoAWQ](auto_awq.md) - [AutoAWQ](auto_awq.md)
- [AutoRound](auto_round.md)
- [BitsAndBytes](bnb.md) - [BitsAndBytes](bnb.md)
- [BitBLAS](bitblas.md) - [BitBLAS](bitblas.md)
- [GGUF](gguf.md) - [GGUF](gguf.md)

View File

@ -0,0 +1,103 @@
# AutoRound
[AutoRound](https://github.com/intel/auto-round) is Intel's advanced quantization algorithm designed to produce highly efficient **INT2, INT3, INT4, and INT8**
quantized large language models—striking an optimal balance between accuracy and deployment performance.
AutoRound applies weight-only quantization to transformer-based models, enabling significant memory savings and faster
inference while maintaining near-original accuracy. It supports a wide range of hardware platforms, including **CPUs,
Intel GPUs, HPUs, and CUDA-enabled devices**.
Please refer to the [AutoRound guide](https://github.com/intel/auto-round/blob/main/docs/step_by_step.md) for more details.
Key Features:
✅ **AutoRound, AutoAWQ, AutoGPTQ, and GGUF** are supported
✅ **10+ vision-language models (VLMs)** are supported
✅ **Per-layer mixed-bit quantization** for fine-grained control
✅ **RTN (Round-To-Nearest) mode** for quick quantization with slight accuracy loss
✅ **Multiple quantization recipes**: best, base, and light
✅ Advanced utilities such as immediate packing and support for **10+ backends**
## Installation
```bash
uv pip install auto-round
```
## Quantizing a model
For VLMs, please change to `auto-round-mllm` in CLI usage and `AutoRoundMLLM` in API usage.
### CLI usage
```bash
auto-round \
--model Qwen/Qwen3-0.6B \
--bits 4 \
--group_size 128 \
--format "auto_round" \
--output_dir ./tmp_autoround
```
```bash
auto-round \
--model Qwen/Qwen3-0.6B \
--format "gguf:q4_k_m" \
--output_dir ./tmp_autoround
```
### API usage
```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from auto_round import AutoRound
model_name = "Qwen/Qwen3-0.6B"
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)
bits, group_size, sym = 4, 128, True
autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym)
# the best accuracy, 4-5X slower, low_gpu_mem_usage could save ~20G but ~30% slower
# autoround = AutoRound(model, tokenizer, nsamples=512, iters=1000, low_gpu_mem_usage=True, bits=bits, group_size=group_size, sym=sym)
# 2-3X speedup, slight accuracy drop at W4G128
# autoround = AutoRound(model, tokenizer, nsamples=128, iters=50, lr=5e-3, bits=bits, group_size=group_size, sym=sym )
output_dir = "./tmp_autoround"
# format= 'auto_round'(default), 'auto_gptq', 'auto_awq'
autoround.quantize_and_save(output_dir, format="auto_round")
```
## Running a quantized model with vLLM
Here is some example code to run auto-round format in vLLM:
```python
from vllm import LLM, SamplingParams
prompts = [
"Hello, my name is",
]
sampling_params = SamplingParams(temperature=0.6, top_p=0.95)
model_name = "Intel/DeepSeek-R1-0528-Qwen3-8B-int4-AutoRound"
llm = LLM(model=model_name)
outputs = llm.generate(prompts, sampling_params)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
## Acknowledgement
Special thanks to open-source low precision libraries such as AutoGPTQ, AutoAWQ, GPTQModel, Triton, Marlin, and
ExLLaMAV2 for providing low-precision CUDA kernels, which are leveraged in AutoRound.

View File

@ -33,7 +33,7 @@ Testing has been conducted on AWS Graviton3 instances for compatibility.
# --8<-- [end:pre-built-images] # --8<-- [end:pre-built-images]
# --8<-- [start:build-image-from-source] # --8<-- [start:build-image-from-source]
```bash ```bash
docker build -f docker/Dockerfile.arm \ docker build -f docker/Dockerfile.cpu \
--tag vllm-cpu-env . --tag vllm-cpu-env .
# Launching OpenAI server # Launching OpenAI server

View File

@ -365,6 +365,7 @@ th {
| `Grok1ModelForCausalLM` | Grok1 | `hpcai-tech/grok-1`. | ✅︎ | ✅︎ | ✅︎ | | `Grok1ModelForCausalLM` | Grok1 | `hpcai-tech/grok-1`. | ✅︎ | ✅︎ | ✅︎ |
| `HunYuanDenseV1ForCausalLM` | Hunyuan-7B-Instruct-0124 | `tencent/Hunyuan-7B-Instruct-0124` | ✅︎ | | ✅︎ | | `HunYuanDenseV1ForCausalLM` | Hunyuan-7B-Instruct-0124 | `tencent/Hunyuan-7B-Instruct-0124` | ✅︎ | | ✅︎ |
| `HunYuanMoEV1ForCausalLM` | Hunyuan-80B-A13B | `tencent/Hunyuan-A13B-Instruct`, `tencent/Hunyuan-A13B-Pretrain`, `tencent/Hunyuan-A13B-Instruct-FP8`, etc. | ✅︎ | | ✅︎ | | `HunYuanMoEV1ForCausalLM` | Hunyuan-80B-A13B | `tencent/Hunyuan-A13B-Instruct`, `tencent/Hunyuan-A13B-Pretrain`, `tencent/Hunyuan-A13B-Instruct-FP8`, etc. | ✅︎ | | ✅︎ |
| `HCXVisionForCausalLM` | HyperCLOVAX-SEED-Vision-Instruct-3B | `naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B` | | | ✅︎ |
| `InternLMForCausalLM` | InternLM | `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc. | ✅︎ | ✅︎ | ✅︎ | | `InternLMForCausalLM` | InternLM | `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `InternLM2ForCausalLM` | InternLM2 | `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc. | ✅︎ | ✅︎ | ✅︎ | | `InternLM2ForCausalLM` | InternLM2 | `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `InternLM3ForCausalLM` | InternLM3 | `internlm/internlm3-8b-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | | `InternLM3ForCausalLM` | InternLM3 | `internlm/internlm3-8b-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
@ -592,6 +593,7 @@ Specified using `--task generate`.
| `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | ✅︎ | | `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | ✅︎ |
| `H2OVLChatModel` | H2OVL | T + I<sup>E+</sup> | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎ | | `H2OVLChatModel` | H2OVL | T + I<sup>E+</sup> | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎ |
| `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | | ✅︎ | | `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | | ✅︎ |
| `InternS1ForConditionalGeneration` | Intern-S1 | T + I<sup>E+</sup> + V<sup>E+</sup> | `internlm/Intern-S1`, etc. | | ✅︎ | ✅︎ |
| `InternVLChatModel` | InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I<sup>E+</sup> + (V<sup>E+</sup>) | `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | ✅︎ | | `InternVLChatModel` | InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I<sup>E+</sup> + (V<sup>E+</sup>) | `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-8B-Preview` | | | ✅︎ | | `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-8B-Preview` | | | ✅︎ |
| `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I<sup>+</sup> | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | | ✅︎ | | `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I<sup>+</sup> | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | | ✅︎ |
@ -612,6 +614,7 @@ Specified using `--task generate`.
| `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + I<sup>E</sup> | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. | | ✅︎ | ⚠️ | | `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + I<sup>E</sup> | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. | | ✅︎ | ⚠️ |
| `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + I<sup>E+</sup> | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | | ✅︎ | ✅︎ | | `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + I<sup>E+</sup> | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | | ✅︎ | ✅︎ |
| `Phi4MMForCausalLM` | Phi-4-multimodal | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Phi4MMForCausalLM` | Phi-4-multimodal | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Phi4MultimodalForCausalLM` | Phi-4-multimodal (HF Transformers) | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct` (with revision `refs/pr/70`), etc. | ✅︎ | ✅︎ | ✅︎ |
| `PixtralForConditionalGeneration` | Mistral 3 (Mistral format), Pixtral (Mistral format) | T + I<sup>+</sup> | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistralai/Pixtral-12B-2409`, etc. | | ✅︎ | ✅︎ | | `PixtralForConditionalGeneration` | Mistral 3 (Mistral format), Pixtral (Mistral format) | T + I<sup>+</sup> | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistralai/Pixtral-12B-2409`, etc. | | ✅︎ | ✅︎ |
| `QwenVLForConditionalGeneration`<sup>^</sup> | Qwen-VL | T + I<sup>E+</sup> | `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ | | `QwenVLForConditionalGeneration`<sup>^</sup> | Qwen-VL | T + I<sup>E+</sup> | `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Qwen2AudioForConditionalGeneration` | Qwen2-Audio | T + A<sup>+</sup> | `Qwen/Qwen2-Audio-7B-Instruct` | | ✅︎ | ✅︎ | | `Qwen2AudioForConditionalGeneration` | Qwen2-Audio | T + A<sup>+</sup> | `Qwen/Qwen2-Audio-7B-Instruct` | | ✅︎ | ✅︎ |

View File

@ -2,10 +2,14 @@
Reinforcement Learning from Human Feedback (RLHF) is a technique that fine-tunes language models using human-generated preference data to align model outputs with desired behaviors. Reinforcement Learning from Human Feedback (RLHF) is a technique that fine-tunes language models using human-generated preference data to align model outputs with desired behaviors.
vLLM can be used to generate the completions for RLHF. The best way to do this is with libraries like [TRL](https://github.com/huggingface/trl), [OpenRLHF](https://github.com/OpenRLHF/OpenRLHF) and [verl](https://github.com/volcengine/verl). vLLM can be used to generate the completions for RLHF. Some ways to do this include using libraries like [TRL](https://github.com/huggingface/trl), [OpenRLHF](https://github.com/OpenRLHF/OpenRLHF), [verl](https://github.com/volcengine/verl) and [unsloth](https://github.com/unslothai/unsloth).
See the following basic examples to get started if you don't want to use an existing library: See the following basic examples to get started if you don't want to use an existing library:
- [Training and inference processes are located on separate GPUs (inspired by OpenRLHF)](../examples/offline_inference/rlhf.md) - [Training and inference processes are located on separate GPUs (inspired by OpenRLHF)](../examples/offline_inference/rlhf.md)
- [Training and inference processes are colocated on the same GPUs using Ray](../examples/offline_inference/rlhf_colocate.md) - [Training and inference processes are colocated on the same GPUs using Ray](../examples/offline_inference/rlhf_colocate.md)
- [Utilities for performing RLHF with vLLM](../examples/offline_inference/rlhf_utils.md) - [Utilities for performing RLHF with vLLM](../examples/offline_inference/rlhf_utils.md)
See the following notebooks showing how to use vLLM for GRPO:
- [Qwen-3 4B GRPO using Unsloth + vLLM](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_(4B)-GRPO.ipynb)

View File

@ -190,6 +190,37 @@ def run_phi4mm(question: str, audio_count: int) -> ModelRequestData:
) )
def run_phi4_multimodal(question: str, audio_count: int) -> ModelRequestData:
    """Build an audio request for the HF-format Phi-4-multimodal-instruct.

    The model accepts both image and audio inputs; this example covers the
    audio path only. The prompt carries one ``<|audio|>`` tag per clip and
    the request is routed through the speech LoRA adapter.
    """
    snapshot_dir = snapshot_download(
        "microsoft/Phi-4-multimodal-instruct", revision="refs/pr/70"
    )
    # The vision and speech adapters live next to the base weights in the
    # snapshot, so the adapter directory has to be spelled out by hand.
    speech_adapter_dir = os.path.join(snapshot_dir, "speech-lora")

    audio_tags = "<|audio|>" * audio_count
    prompt_text = f"<|user|>{audio_tags}{question}<|end|><|assistant|>"

    engine_args = EngineArgs(
        model=snapshot_dir,
        max_model_len=12800,
        max_num_seqs=2,
        enable_lora=True,
        max_lora_rank=320,
        limit_mm_per_prompt={"audio": audio_count},
    )

    speech_request = LoRARequest("speech", 1, speech_adapter_dir)
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt_text,
        lora_requests=[speech_request],
    )
# Qwen2-Audio # Qwen2-Audio
def run_qwen2_audio(question: str, audio_count: int) -> ModelRequestData: def run_qwen2_audio(question: str, audio_count: int) -> ModelRequestData:
model_name = "Qwen/Qwen2-Audio-7B-Instruct" model_name = "Qwen/Qwen2-Audio-7B-Instruct"
@ -303,6 +334,7 @@ model_example_map = {
"granite_speech": run_granite_speech, "granite_speech": run_granite_speech,
"minicpmo": run_minicpmo, "minicpmo": run_minicpmo,
"phi4_mm": run_phi4mm, "phi4_mm": run_phi4mm,
"phi4_multimodal": run_phi4_multimodal,
"qwen2_audio": run_qwen2_audio, "qwen2_audio": run_qwen2_audio,
"qwen2_5_omni": run_qwen2_5_omni, "qwen2_5_omni": run_qwen2_5_omni,
"ultravox": run_ultravox, "ultravox": run_ultravox,

View File

@ -3,12 +3,12 @@
import argparse import argparse
import datetime import datetime
import os import os
import re
from typing import Union from typing import Union
import albumentations import albumentations
import numpy as np import numpy as np
import rasterio import rasterio
import regex as re
import torch import torch
from einops import rearrange from einops import rearrange
from terratorch.datamodules import Sen1Floods11NonGeoDataModule from terratorch.datamodules import Sen1Floods11NonGeoDataModule

View File

@ -29,6 +29,7 @@ import shutil
from pathlib import Path from pathlib import Path
from vllm import LLM, EngineArgs from vllm import LLM, EngineArgs
from vllm.model_executor.model_loader import ShardedStateLoader
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser
@ -39,7 +40,10 @@ def parse_args():
"--output", "-o", required=True, type=str, help="path to output checkpoint" "--output", "-o", required=True, type=str, help="path to output checkpoint"
) )
parser.add_argument( parser.add_argument(
"--file-pattern", type=str, help="string pattern of saved filenames" "--file-pattern",
type=str,
default=ShardedStateLoader.DEFAULT_PATTERN,
help="string pattern of saved filenames",
) )
parser.add_argument( parser.add_argument(
"--max-file-size", "--max-file-size",

View File

@ -316,6 +316,85 @@ def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
) )
# naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B
def run_hyperclovax_seed_vision(
    questions: list[str], modality: str
) -> ModelRequestData:
    """Build image or video requests for HyperCLOVAX-SEED-Vision-Instruct-3B.

    Args:
        questions: One user question per prompt to generate.
        modality: Either ``"image"`` or ``"video"``.

    Returns:
        The engine arguments together with one chat-templated prompt per
        question.

    Raises:
        ValueError: If ``modality`` is neither ``"image"`` nor ``"video"``.
    """
    # Validate the modality once, up front: this rejects bad input even when
    # `questions` is empty and before the tokenizer is loaded.
    if modality == "image":
        # Optional hint fields on an image content item:
        #   ocr: List the words in the image in raster order.
        #       Even if the word order feels unnatural for reading,
        #       the model will handle it as long as it follows raster order.
        #       e.g. "Naver, CLOVA, bigshane"
        #   lens_keywords: List the entity names in the image.
        #       e.g. "iPhone"
        #   lens_local_keywords: List the entity names with quads in the image.
        #       e.g. "[0.07, 0.21, 0.92, 0.90] iPhone"
        media_item = {
            "type": "image",
            "ocr": "",
            "lens_keywords": "",
            "lens_local_keywords": "",
        }
    elif modality == "video":
        media_item = {"type": "video"}
    else:
        raise ValueError(f"Unsupported modality: {modality}")

    model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192 if modality == "image" else 16384,
        limit_mm_per_prompt={modality: 1},
    )

    # One single-turn conversation per question; each conversation gets its
    # own copy of the media item so the conversations share no mutable state.
    messages = [
        [
            {
                "role": "user",
                "content": [
                    dict(media_item),
                    {
                        "type": "text",
                        "text": question,
                    },
                ],
            }
        ]
        for question in questions
    ]

    prompts = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=None,
    )
# Idefics3-8B-Llama3 # Idefics3-8B-Llama3
def run_idefics3(questions: list[str], modality: str) -> ModelRequestData: def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image" assert modality == "image"
@ -389,6 +468,39 @@ def run_tarsier(questions: list[str], modality: str) -> ModelRequestData:
) )
# Intern-S1
def run_interns1(questions: list[str], modality: str) -> ModelRequestData:
    """Build image or video requests for Intern-S1.

    Args:
        questions: One user question per prompt to generate.
        modality: Either ``"image"`` or ``"video"``.

    Returns:
        The engine arguments together with one chat-templated prompt per
        question.

    Raises:
        ValueError: If ``modality`` is neither ``"image"`` nor ``"video"``.
    """
    # Resolve the placeholder first so an unsupported modality fails with a
    # clear error instead of an unbound-variable error further down.
    if modality == "image":
        placeholder = "<IMG_CONTEXT>"
    elif modality == "video":
        placeholder = "<video>"
    else:
        raise ValueError(f"Unsupported modality: {modality}")

    model_name = "internlm/Intern-S1"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
        max_num_seqs=2,
        limit_mm_per_prompt={modality: 1},
        enforce_eager=True,
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    # One single-turn conversation per question, with the media placeholder
    # on its own line ahead of the question text.
    messages = [
        [{"role": "user", "content": f"{placeholder}\n{question}"}]
        for question in questions
    ]
    prompts = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
# InternVL # InternVL
def run_internvl(questions: list[str], modality: str) -> ModelRequestData: def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
model_name = "OpenGVLab/InternVL3-2B" model_name = "OpenGVLab/InternVL3-2B"
@ -987,6 +1099,41 @@ def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData:
) )
# HF format Phi-4-multimodal-instruct
def run_phi4_multimodal(questions: list[str], modality: str) -> ModelRequestData:
    """Build image requests for the HF-format Phi-4-multimodal-instruct.

    The model accepts both image and audio inputs; this example covers the
    image path only, routing every prompt through the vision LoRA adapter.
    """
    assert modality == "image"

    snapshot_dir = snapshot_download(
        "microsoft/Phi-4-multimodal-instruct", revision="refs/pr/70"
    )
    # The vision and speech adapters live next to the base weights in the
    # snapshot, so the adapter directory has to be spelled out by hand.
    vision_adapter_dir = os.path.join(snapshot_dir, "vision-lora")

    image_prompts = []
    for question in questions:
        image_prompts.append(f"<|user|><|image|>{question}<|end|><|assistant|>")

    engine_args = EngineArgs(
        model=snapshot_dir,
        max_model_len=5120,
        max_num_seqs=2,
        max_num_batched_tokens=12800,
        enable_lora=True,
        max_lora_rank=320,
        # Note - mm_processor_kwargs can also be passed to generate/chat calls
        mm_processor_kwargs={"dynamic_hd": 16},
        limit_mm_per_prompt={"image": 1},
    )

    vision_request = LoRARequest("vision", 1, vision_adapter_dir)
    return ModelRequestData(
        engine_args=engine_args,
        prompts=image_prompts,
        lora_requests=[vision_request],
    )
# Pixtral HF-format # Pixtral HF-format
def run_pixtral_hf(questions: list[str], modality: str) -> ModelRequestData: def run_pixtral_hf(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image" assert modality == "image"
@ -1222,7 +1369,9 @@ model_example_map = {
"glm4v": run_glm4v, "glm4v": run_glm4v,
"glm4_1v": run_glm4_1v, "glm4_1v": run_glm4_1v,
"h2ovl_chat": run_h2ovl, "h2ovl_chat": run_h2ovl,
"hyperclovax_seed_vision": run_hyperclovax_seed_vision,
"idefics3": run_idefics3, "idefics3": run_idefics3,
"interns1": run_interns1,
"internvl_chat": run_internvl, "internvl_chat": run_internvl,
"nemotron_vl": run_nemotron_vl, "nemotron_vl": run_nemotron_vl,
"keye_vl": run_keye_vl, "keye_vl": run_keye_vl,
@ -1244,6 +1393,7 @@ model_example_map = {
"paligemma2": run_paligemma2, "paligemma2": run_paligemma2,
"phi3_v": run_phi3v, "phi3_v": run_phi3v,
"phi4_mm": run_phi4mm, "phi4_mm": run_phi4mm,
"phi4_multimodal": run_phi4_multimodal,
"pixtral_hf": run_pixtral_hf, "pixtral_hf": run_pixtral_hf,
"qwen_vl": run_qwen_vl, "qwen_vl": run_qwen_vl,
"qwen2_vl": run_qwen2_vl, "qwen2_vl": run_qwen2_vl,

View File

@ -253,6 +253,33 @@ def load_smolvlm(question: str, image_urls: list[str]) -> ModelRequestData:
) )
def load_interns1(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for Intern-S1.

    Each image gets a numbered ``Image-N: <IMG_CONTEXT>`` line ahead of the
    question, and the images themselves are fetched from ``image_urls``.
    """
    model_name = "internlm/Intern-S1"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=4096,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # Numbering is 1-based; every entry keeps its own trailing newline.
    image_lines = [
        f"Image-{index}: <IMG_CONTEXT>\n"
        for index in range(1, len(image_urls) + 1)
    ]
    placeholders = "\n".join(image_lines)
    conversation = [{"role": "user", "content": f"{placeholders}\n{question}"}]

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    prompt = tokenizer.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )
def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData: def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "OpenGVLab/InternVL2-2B" model_name = "OpenGVLab/InternVL2-2B"
@ -289,6 +316,53 @@ def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
) )
def load_hyperclovax_seed_vision(
    question: str, image_urls: list[str]
) -> ModelRequestData:
    """Build a multi-image request for HyperCLOVAX-SEED-Vision-Instruct-3B.

    The single user turn holds one image item per URL (with empty hint
    fields) followed by the question text.
    """
    model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=16384,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    image_items = [
        {
            "type": "image",
            "image": url,
            "ocr": "",
            "lens_keywords": "",
            "lens_local_keywords": "",
        }
        for url in image_urls
    ]
    text_item = {"type": "text", "text": question}
    user_turn = {"role": "user", "content": [*image_items, text_item]}

    prompt = tokenizer.apply_chat_template(
        [user_turn],
        tokenize=False,
        add_generation_prompt=True,
    )

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        stop_token_ids=None,
        image_data=images,
    )
def load_llava(question: str, image_urls: list[str]) -> ModelRequestData: def load_llava(question: str, image_urls: list[str]) -> ModelRequestData:
# NOTE: CAUTION! Original Llava models wasn't really trained on multi-image inputs, # NOTE: CAUTION! Original Llava models wasn't really trained on multi-image inputs,
# it will generate poor response for multi-image inputs! # it will generate poor response for multi-image inputs!
@ -686,6 +760,40 @@ def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData:
) )
def load_phi4_multimodal(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for the HF-format Phi-4-multimodal-instruct.

    The model accepts both image and audio inputs; this example covers
    multi-image prompts, routed through the vision LoRA adapter.
    """
    snapshot_dir = snapshot_download(
        "microsoft/Phi-4-multimodal-instruct", revision="refs/pr/70"
    )
    # The vision and speech adapters live next to the base weights in the
    # snapshot, so the adapter directory has to be spelled out by hand.
    vision_adapter_dir = os.path.join(snapshot_dir, "vision-lora")
    image_count = len(image_urls)

    engine_args = EngineArgs(
        model=snapshot_dir,
        max_model_len=4096,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": image_count},
        enable_lora=True,
        max_lora_rank=320,
        # Note - mm_processor_kwargs can also be passed to generate/chat calls
        mm_processor_kwargs={"dynamic_hd": 4},
    )

    image_tags = "<|image|>" * image_count
    prompt = f"<|user|>{image_tags}{question}<|end|><|assistant|>"

    images = [fetch_image(url) for url in image_urls]
    vision_request = LoRARequest("vision", 1, vision_adapter_dir)
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
        lora_requests=[vision_request],
    )
def load_qwen_vl_chat(question: str, image_urls: list[str]) -> ModelRequestData: def load_qwen_vl_chat(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "Qwen/Qwen-VL-Chat" model_name = "Qwen/Qwen-VL-Chat"
engine_args = EngineArgs( engine_args = EngineArgs(
@ -899,7 +1007,9 @@ model_example_map = {
"gemma3": load_gemma3, "gemma3": load_gemma3,
"h2ovl_chat": load_h2ovl, "h2ovl_chat": load_h2ovl,
"idefics3": load_idefics3, "idefics3": load_idefics3,
"interns1": load_interns1,
"internvl_chat": load_internvl, "internvl_chat": load_internvl,
"hyperclovax_seed_vision": load_hyperclovax_seed_vision,
"keye_vl": load_keye_vl, "keye_vl": load_keye_vl,
"kimi_vl": load_kimi_vl, "kimi_vl": load_kimi_vl,
"llava": load_llava, "llava": load_llava,
@ -912,6 +1022,7 @@ model_example_map = {
"ovis": load_ovis, "ovis": load_ovis,
"phi3_v": load_phi3v, "phi3_v": load_phi3v,
"phi4_mm": load_phi4mm, "phi4_mm": load_phi4mm,
"phi4_multimodal": load_phi4_multimodal,
"pixtral_hf": load_pixtral_hf, "pixtral_hf": load_pixtral_hf,
"qwen_vl_chat": load_qwen_vl_chat, "qwen_vl_chat": load_qwen_vl_chat,
"qwen2_vl": load_qwen2_vl, "qwen2_vl": load_qwen2_vl,

View File

@ -29,7 +29,7 @@ PROXY_PORT=${PROXY_PORT:-30001}
PREFILL_GPUS=${PREFILL_GPUS:-0} PREFILL_GPUS=${PREFILL_GPUS:-0}
DECODE_GPUS=${DECODE_GPUS:-1,2,3} DECODE_GPUS=${DECODE_GPUS:-1,2,3}
PREFILL_PORTS=${PREFILL_PORTS:-20003} PREFILL_PORTS=${PREFILL_PORTS:-20003}
DECODE_PORTS=${DECODE_PORTS:-20005,20007,20009} DECODE_PORTS=${DECODE_PORTS:-20005,20007,20009}
echo "Warning: P2P NCCL disaggregated prefill XpYd support for vLLM v1 is experimental and subject to change." echo "Warning: P2P NCCL disaggregated prefill XpYd support for vLLM v1 is experimental and subject to change."
echo "" echo ""
@ -164,7 +164,7 @@ main() {
local gpu_id=${PREFILL_GPU_ARRAY[$i]} local gpu_id=${PREFILL_GPU_ARRAY[$i]}
local port=${PREFILL_PORT_ARRAY[$i]} local port=${PREFILL_PORT_ARRAY[$i]}
local kv_port=$((21001 + i)) local kv_port=$((21001 + i))
echo " Prefill server $((i+1)): GPU $gpu_id, Port $port, KV Port $kv_port" echo " Prefill server $((i+1)): GPU $gpu_id, Port $port, KV Port $kv_port"
CUDA_VISIBLE_DEVICES=$gpu_id VLLM_USE_V1=1 vllm serve $MODEL \ CUDA_VISIBLE_DEVICES=$gpu_id VLLM_USE_V1=1 vllm serve $MODEL \
--enforce-eager \ --enforce-eager \
@ -193,7 +193,7 @@ main() {
local gpu_id=${DECODE_GPU_ARRAY[$i]} local gpu_id=${DECODE_GPU_ARRAY[$i]}
local port=${DECODE_PORT_ARRAY[$i]} local port=${DECODE_PORT_ARRAY[$i]}
local kv_port=$((22001 + i)) local kv_port=$((22001 + i))
echo " Decode server $((i+1)): GPU $gpu_id, Port $port, KV Port $kv_port" echo " Decode server $((i+1)): GPU $gpu_id, Port $port, KV Port $kv_port"
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=$gpu_id vllm serve $MODEL \ VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=$gpu_id vllm serve $MODEL \
--enforce-eager \ --enforce-eager \
@ -233,7 +233,7 @@ main() {
# Run Benchmark # Run Benchmark
# ============================================================================= # =============================================================================
cd ../../../benchmarks/ cd ../../../benchmarks/
python3 benchmark_serving.py --port 10001 --seed $(date +%s) \ vllm bench serve --port 10001 --seed $(date +%s) \
--model $MODEL \ --model $MODEL \
--dataset-name random --random-input-len 7500 --random-output-len 200 \ --dataset-name random --random-input-len 7500 --random-output-len 200 \
--num-prompts 200 --burstiness 100 --request-rate 2 | tee benchmark.log --num-prompts 200 --burstiness 100 --request-rate 2 | tee benchmark.log
@ -243,4 +243,4 @@ main() {
cleanup cleanup
} }
main main

View File

@ -28,7 +28,7 @@ Submit some sample requests to the server:
```bash ```bash
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
python3 ../../../benchmarks/benchmark_serving.py \ vllm bench serve \
--model mistralai/Mistral-7B-v0.1 \ --model mistralai/Mistral-7B-v0.1 \
--tokenizer mistralai/Mistral-7B-v0.1 \ --tokenizer mistralai/Mistral-7B-v0.1 \
--endpoint /v1/completions \ --endpoint /v1/completions \

View File

@ -122,7 +122,7 @@ main() {
# begin benchmark # begin benchmark
cd ../../../../benchmarks/ cd ../../../../benchmarks/
python3 benchmark_serving.py --port 9000 --seed $(date +%s) \ vllm bench serve --port 9000 --seed $(date +%s) \
--model meta-llama/Llama-3.1-8B-Instruct \ --model meta-llama/Llama-3.1-8B-Instruct \
--dataset-name random --random-input-len 7500 --random-output-len 200 \ --dataset-name random --random-input-len 7500 --random-output-len 200 \
--num-prompts 200 --burstiness 100 --request-rate 3.6 | tee benchmark.log --num-prompts 200 --burstiness 100 --request-rate 3.6 | tee benchmark.log
@ -133,4 +133,4 @@ main() {
} }
main main

View File

@ -10,7 +10,8 @@ setuptools>=77.0.3,<80.0.0
--extra-index-url https://download.pytorch.org/whl/cpu --extra-index-url https://download.pytorch.org/whl/cpu
torch==2.6.0+cpu; platform_machine == "x86_64" # torch>2.6.0+cpu has performance regression on x86 platform, see https://github.com/pytorch/pytorch/pull/151218 torch==2.6.0+cpu; platform_machine == "x86_64" # torch>2.6.0+cpu has performance regression on x86 platform, see https://github.com/pytorch/pytorch/pull/151218
torch==2.7.0; platform_system == "Darwin" torch==2.7.0; platform_system == "Darwin"
torch==2.7.0; platform_machine == "ppc64le" or platform_machine == "aarch64" torch==2.7.0; platform_machine == "ppc64le"
torch==2.6.0; platform_machine == "aarch64" # for arm64 CPUs, torch 2.7.0 has an issue: https://github.com/vllm-project/vllm/issues/17960
# required for the image processor of minicpm-o-2_6, this must be updated alongside torch # required for the image processor of minicpm-o-2_6, this must be updated alongside torch
torchaudio; platform_machine != "ppc64le" and platform_machine != "s390x" torchaudio; platform_machine != "ppc64le" and platform_machine != "s390x"
@ -25,3 +26,6 @@ datasets # for benchmark scripts
intel-openmp==2024.2.1; platform_machine == "x86_64" intel-openmp==2024.2.1; platform_machine == "x86_64"
intel_extension_for_pytorch==2.6.0; platform_machine == "x86_64" # torch>2.6.0+cpu has performance regression on x86 platform, see https://github.com/pytorch/pytorch/pull/151218 intel_extension_for_pytorch==2.6.0; platform_machine == "x86_64" # torch>2.6.0+cpu has performance regression on x86 platform, see https://github.com/pytorch/pytorch/pull/151218
triton==3.2.0; platform_machine == "x86_64" # Triton is required for torch 2.6+cpu, as it is imported in torch.compile. triton==3.2.0; platform_machine == "x86_64" # Triton is required for torch 2.6+cpu, as it is imported in torch.compile.
# Use this to gather CPU info and optimize based on ARM Neoverse cores
py-cpuinfo; platform_machine == "aarch64"

View File

@ -19,8 +19,8 @@ nixl==0.3.0
--find-links https://storage.googleapis.com/libtpu-releases/index.html --find-links https://storage.googleapis.com/libtpu-releases/index.html
--find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html --find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html
--find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html --find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
torch==2.9.0.dev20250716 torch==2.9.0.dev20250724
torchvision==0.24.0.dev20250716 torchvision==0.24.0.dev20250724
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.9.0.dev20250716-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.9.0.dev20250724-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.9.0.dev20250716-cp312-cp312-linux_x86_64.whl ; python_version == "3.12" torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.9.0.dev20250724-cp312-cp312-linux_x86_64.whl ; python_version == "3.12"

View File

@ -1062,8 +1062,17 @@ class VllmRunner:
return [req_output.outputs.score for req_output in req_outputs] return [req_output.outputs.score for req_output in req_outputs]
def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]: def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]:
executor = self.llm.llm_engine.model_executor if hasattr(self.llm.llm_engine, "model_executor"):
return executor.apply_model(func) # This works either in V0 or in V1 with
# VLLM_ENABLE_V1_MULTIPROCESSING=0
executor = self.llm.llm_engine.model_executor
return executor.apply_model(func)
# This works in V1 with VLLM_ALLOW_INSECURE_SERIALIZATION=1
def _apply_model(self):
return func(self.get_model())
return self.llm.llm_engine.collective_rpc(_apply_model)
def __enter__(self): def __enter__(self):
return self return self

View File

@ -0,0 +1,93 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import base64
import io
import numpy as np
import pytest
import requests
import torch
from ...utils import RemoteOpenAIServer
MODEL_NAME = "christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM"
DTYPE = "float16"
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
    """Autouse wrapper that runs each test in this module with both engines.

    The body is intentionally empty: requesting the
    ``run_with_both_engines`` fixture is all that is needed. This wrapper
    can be promoted to ``conftest.py`` to cover every test in a package.
    """
@pytest.fixture(scope="module")
def server():
    """Launch a ``RemoteOpenAIServer`` for ``MODEL_NAME`` once per module.

    Yields the running server object; the context manager is exited when
    the module's tests are finished.
    """
    # Half precision is used for speed and memory savings in CI.
    dtype_flags = ["--dtype", DTYPE]
    cli_args = [
        "--task",
        "embed",
        *dtype_flags,
        "--enforce-eager",
        "--trust-remote-code",
        "--skip-tokenizer-init",
        "--max-num-seqs",
        "32",
    ]

    with RemoteOpenAIServer(MODEL_NAME, cli_args) as running_server:
        yield running_server
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_request(server: RemoteOpenAIServer, model_name: str):
    """Post one pooling request built from tensor inputs and check its size.

    Two constant float16 tensors are serialized with ``torch.save``,
    base64-encoded and sent as ``image_embeds``. The base64 response payload
    is decoded as float32 and its element count is checked.
    """

    def to_base64(tensor: torch.Tensor) -> str:
        # torch.save into an in-memory buffer, then base64 the raw bytes.
        stream = io.BytesIO()
        torch.save(tensor, stream)
        return base64.b64encode(stream.getvalue()).decode('utf-8')

    pixel_values = torch.full((6, 512, 512), 1.0, dtype=torch.float16)
    location_coords = torch.full((1, 2), 1.0, dtype=torch.float16)

    base64_tensor_embedding = to_base64(pixel_values)
    base64_coord_embedding = to_base64(location_coords)

    image_embeds_part = {
        "type": "image_embeds",
        "image_embeds": {
            "pixel_values": base64_tensor_embedding,
            "location_coords": base64_coord_embedding,
        },
    }
    user_message = {"role": "user", "content": [image_embeds_part]}
    prompt = {
        "model": model_name,
        "additional_data": {
            "prompt_token_ids": [1]
        },
        "encoding_format": "base64",
        "messages": [user_message],
    }

    # test single pooling
    response = requests.post(server.url_for("pooling"), json=prompt)
    response.raise_for_status()

    first_item = response.json()["data"][0]
    decoded = base64.b64decode(first_item['data'])
    np_response = np.frombuffer(decoded, dtype=np.float32)

    assert len(np_response) == 524288

View File

@ -0,0 +1,191 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Optional
import pytest
import torch
import vllm.v1.attention.backends.rocm_aiter_fa # noqa: F401
from vllm.platforms import current_platform
# Parametrization values for test_varlen_with_paged_kv.

# (num_query_heads, num_kv_heads) pairs; the test asserts that the first is
# a multiple of the second.
NUM_HEADS = [(4, 4), (8, 2), (16, 2)]
HEAD_SIZES = [128, 256]
BLOCK_SIZES = [16, 32]
DTYPES = [torch.float16, torch.bfloat16]
# Dtypes to cast Q/K/V to before calling the kernel; None leaves the inputs
# in their original dtype.
QDTYPES = [None]
# one value large enough to test overflow in index calculation.
# one value small enough to test the schema op check
NUM_BLOCKS = [32768, 2048]
def ref_paged_attn(
    query: torch.Tensor,
    key_cache: torch.Tensor,
    value_cache: torch.Tensor,
    query_lens: list[int],
    kv_lens: list[int],
    block_tables: torch.Tensor,
    scale: float,
    sliding_window: Optional[int] = None,
    soft_cap: Optional[float] = None,
) -> torch.Tensor:
    """Reference (pure PyTorch) causal attention over a paged KV cache.

    Sequences are packed back to back along the first dimension of
    ``query``; sequence ``i`` owns ``query_lens[i]`` rows and attends to the
    first ``kv_lens[i]`` cached tokens gathered through ``block_tables[i]``.

    Args:
        query: Packed queries, indexed as (token, head, head_size).
        key_cache: Paged keys of shape
            (num_blocks, block_size, num_kv_heads, head_size).
        value_cache: Paged values, viewed with the same layout as
            ``key_cache``.
        query_lens: Number of query tokens per sequence.
        kv_lens: Number of cached tokens per sequence.
        block_tables: Per-sequence block indices into the caches.
        scale: Multiplier applied to the queries before the dot product.
        sliding_window: If given, additionally masks keys that fall outside
            the window.
        soft_cap: If given, logits become
            ``soft_cap * tanh(logits / soft_cap)``.

    Returns:
        The per-sequence outputs concatenated along dim 0.

    The input ``query`` is left unmodified.
    """
    num_seqs = len(query_lens)
    block_tables = block_tables.cpu().numpy()
    _, block_size, num_kv_heads, head_size = key_cache.shape

    outputs: list[torch.Tensor] = []
    start_idx = 0
    for i in range(num_seqs):
        query_len = query_lens[i]
        kv_len = kv_lens[i]

        # Out-of-place multiply: the slice is a view of the caller's tensor,
        # so an in-place `*=` would silently rescale `query` itself.
        q = query[start_idx:start_idx + query_len] * scale

        # Gather this sequence's blocks and trim the padding of the last one.
        num_kv_blocks = (kv_len + block_size - 1) // block_size
        block_indices = block_tables[i, :num_kv_blocks]

        k = key_cache[block_indices].view(-1, num_kv_heads, head_size)
        k = k[:kv_len]
        v = value_cache[block_indices].view(-1, num_kv_heads, head_size)
        v = v[:kv_len]

        # Repeat KV heads so each query head has a matching KV head.
        if q.shape[1] != k.shape[1]:
            k = torch.repeat_interleave(k, q.shape[1] // k.shape[1], dim=1)
            v = torch.repeat_interleave(v, q.shape[1] // v.shape[1], dim=1)

        attn = torch.einsum("qhd,khd->hqk", q, k).float()

        # Causal mask: the queries are aligned with the last `query_len`
        # positions of the KV sequence.
        empty_mask = torch.ones(query_len, kv_len)
        mask = torch.triu(empty_mask, diagonal=kv_len - query_len + 1).bool()
        if sliding_window is not None:
            sliding_window_mask = torch.triu(empty_mask,
                                             diagonal=kv_len -
                                             (query_len + sliding_window) +
                                             1).bool().logical_not()
            mask |= sliding_window_mask
        if soft_cap is not None:
            attn = soft_cap * torch.tanh(attn / soft_cap)
        attn.masked_fill_(mask, float("-inf"))
        attn = torch.softmax(attn, dim=-1).to(v.dtype)
        out = torch.einsum("hqk,khd->qhd", attn, v)

        outputs.append(out)
        start_idx += query_len

    return torch.cat(outputs, dim=0)
@pytest.mark.skipif(not current_platform.is_rocm(),
                    reason="Only ROCm is supported")
@pytest.mark.parametrize("seq_lens",
                         [[(10, 1328), (5, 18),
                           (129, 463)], [(8, 523), (24, 37), (3, 2011)]])
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
@pytest.mark.parametrize("sliding_window", [None, 256])
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("soft_cap", [None])
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
@pytest.mark.parametrize("q_dtype", QDTYPES)
@torch.inference_mode()
def test_varlen_with_paged_kv(
    seq_lens: list[tuple[int, int]],
    num_heads: tuple[int, int],
    head_size: int,
    sliding_window: Optional[int],
    dtype: torch.dtype,
    block_size: int,
    soft_cap: Optional[float],
    num_blocks: int,
    q_dtype: Optional[torch.dtype],
) -> None:
    """Compare ``torch.ops.vllm.flash_attn_varlen_func`` with the reference.

    Random queries and a random paged KV cache are fed to the op, and the
    result is checked against ``ref_paged_attn`` on the same (unquantized)
    inputs.

    Args:
        seq_lens: One ``(query_len, kv_len)`` pair per sequence.
        num_heads: ``(num_query_heads, num_kv_heads)``.
        head_size: Size of each attention head.
        sliding_window: Window size, or None for no sliding window.
        dtype: Dtype of the query and KV cache.
        block_size: Number of tokens per KV cache block.
        soft_cap: Forwarded to the reference implementation only.
        num_blocks: Number of blocks in the KV cache.
        q_dtype: If not None, Q/K/V are cast to this dtype for the op and
            looser tolerances are used.
    """
    torch.set_default_device("cuda")
    current_platform.seed_everything(0)
    num_seqs = len(seq_lens)
    query_lens = [x[0] for x in seq_lens]
    kv_lens = [x[1] for x in seq_lens]
    num_query_heads = num_heads[0]
    num_kv_heads = num_heads[1]
    assert num_query_heads % num_kv_heads == 0
    max_query_len = max(query_lens)
    max_kv_len = max(kv_lens)
    window_size = ((sliding_window - 1, 0) if sliding_window is not None else
                   (-1, -1))
    scale = head_size**-0.5

    query = torch.randn(sum(query_lens),
                        num_query_heads,
                        head_size,
                        dtype=dtype)
    key_cache = torch.randn(num_blocks,
                            block_size,
                            num_kv_heads,
                            head_size,
                            dtype=dtype)
    value_cache = torch.randn_like(key_cache)
    # Cumulative (prefix-sum) lengths, with a leading 0, as passed to the op.
    cu_query_lens = torch.tensor([0] + query_lens,
                                 dtype=torch.int32).cumsum(dim=0,
                                                           dtype=torch.int32)
    cu_seq_lens = torch.tensor([0] + kv_lens,
                               dtype=torch.int32).cumsum(dim=0,
                                                         dtype=torch.int32)
    # NOTE: `kv_lens` deliberately stays a plain list of ints. The op only
    # consumes `cu_seq_lens`, and `ref_paged_attn` expects `list[int]`.

    max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size
    block_tables = torch.randint(0,
                                 num_blocks,
                                 (num_seqs, max_num_blocks_per_seq),
                                 dtype=torch.int32)

    output = torch.empty_like(query)

    maybe_quantized_query = query
    maybe_quantized_key_cache = key_cache
    maybe_quantized_value_cache = value_cache
    k_descale = None
    v_descale = None
    if q_dtype is not None:
        # QKV are drawn from N(0, 1): no need for a fp8 scaling factor
        maybe_quantized_query = query.to(q_dtype)
        maybe_quantized_key_cache = key_cache.to(q_dtype)
        maybe_quantized_value_cache = value_cache.to(q_dtype)

        scale_shape = (num_seqs, num_kv_heads)
        k_descale = torch.ones(scale_shape, dtype=torch.float32)
        v_descale = torch.ones(scale_shape, dtype=torch.float32)

    torch.ops.vllm.flash_attn_varlen_func(
        maybe_quantized_query,
        maybe_quantized_key_cache,
        maybe_quantized_value_cache,
        out=output,
        cu_seqlens_q=cu_query_lens,
        max_seqlen_q=max_query_len,
        max_seqlen_k=max_kv_len,
        softmax_scale=scale,
        alibi_slopes=None,
        window_size=window_size,
        block_table=block_tables,
        cu_seqlens_k=cu_seq_lens,
        k_scale=k_descale,
        v_scale=v_descale,
    )

    ref_output = ref_paged_attn(
        query=query,
        key_cache=key_cache,
        value_cache=value_cache,
        query_lens=query_lens,
        kv_lens=kv_lens,
        block_tables=block_tables,
        scale=scale,
        sliding_window=sliding_window,
        soft_cap=soft_cap,
    )

    atol, rtol = 2e-2, 2e-2
    if q_dtype is not None:
        atol, rtol = 1.5e-1, 1.5e-1

    # Attach the max absolute difference to the failure message through
    # `msg`; a string placed after the call with a comma is just discarded.
    torch.testing.assert_close(
        output,
        ref_output,
        atol=atol,
        rtol=rtol,
        msg=lambda default_msg: (
            f"{default_msg}\n"
            f"max abs diff: {torch.max(torch.abs(output - ref_output))}"),
    )

View File

@ -17,28 +17,34 @@ from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import (
moe_permute, moe_permute_unpermute_supported, moe_unpermute) moe_permute, moe_permute_unpermute_supported, moe_unpermute)
from vllm.platforms import current_platform from vllm.platforms import current_platform
NUM_EXPERTS = [16, 64] NUM_EXPERTS = [16, 64, 256]
TOP_KS = [2, 4, 6, 8] TOP_KS = [2, 4, 6, 8]
EP_SIZE = [1, 4, 16] EP_SIZE = [1, 4, 16]
current_platform.seed_everything(0) current_platform.seed_everything(0)
def torch_permute(hidden_states: torch.Tensor, def torch_permute(
topk_ids: torch.Tensor, hidden_states: torch.Tensor,
token_expert_indices: torch.Tensor, topk_ids: torch.Tensor,
topk: int, # token_expert_indices: torch.Tensor,
n_expert: int, topk: int,
n_local_expert: int, n_expert: int,
start_expert: int, n_local_expert: int,
expert_map: Optional[torch.Tensor] = None, start_expert: int,
align_block_size: Optional[int] = None, expert_map: Optional[torch.Tensor] = None,
fill_invalid_expert: int = -1) -> list[torch.Tensor]: align_block_size: Optional[int] = None,
fill_invalid_expert: int = -1) -> list[torch.Tensor]:
n_token, n_hidden = hidden_states.shape[0], hidden_states.shape[1] n_token, n_hidden = hidden_states.shape[0], hidden_states.shape[1]
if expert_map is not None: if expert_map is not None:
is_local_expert = (expert_map[topk_ids] != -1) is_local_expert = (expert_map[topk_ids] != -1)
not_local_expert = (expert_map[topk_ids] == -1) not_local_expert = (expert_map[topk_ids] == -1)
topk_ids = is_local_expert * ( topk_ids = is_local_expert * (
topk_ids - start_expert) + not_local_expert * (topk_ids + n_expert) topk_ids - start_expert) + not_local_expert * (topk_ids + n_expert)
token_expert_indices = torch.arange(0,
n_token * topk,
dtype=torch.int32,
device=hidden_states.device).reshape(
(n_token, topk))
sorted_topk_ids, sorted_indices = torch.sort(topk_ids.flatten(), sorted_topk_ids, sorted_indices = torch.sort(topk_ids.flatten(),
stable=True) stable=True)
@ -59,8 +65,8 @@ def torch_permute(hidden_states: torch.Tensor,
valid_row_idx = [] valid_row_idx = []
if align_block_size is None: if align_block_size is None:
permuted_hidden_states = hidden_states[dst_row_id2src_row_id_map % permuted_hidden_states = hidden_states[dst_row_id2src_row_id_map //
n_token, ...] topk, ...]
permuted_row_size = permuted_hidden_states.shape[0] permuted_row_size = permuted_hidden_states.shape[0]
m_indices = torch.empty(permuted_row_size, m_indices = torch.empty(permuted_row_size,
device="cuda", device="cuda",
@ -73,14 +79,21 @@ def torch_permute(hidden_states: torch.Tensor,
0, n_token * topk, device="cuda", 0, n_token * topk, device="cuda",
dtype=torch.int32)[src2dst_idx].reshape((n_token, topk)) dtype=torch.int32)[src2dst_idx].reshape((n_token, topk))
valid_row_idx += [i for i in range(expert_first_token_offset[-1])] valid_row_idx += [i for i in range(expert_first_token_offset[-1])]
dst_row_id2src_row_id_map[
expert_first_token_offset[-1]:] = n_token * topk
return [ return [
permuted_hidden_states, expert_first_token_offset, permuted_hidden_states, expert_first_token_offset,
src_row_id2dst_row_id_map, m_indices, valid_row_idx src_row_id2dst_row_id_map, dst_row_id2src_row_id_map, m_indices,
valid_row_idx
] ]
else: else:
permuted_row_size = (topk * n_token + n_expert * permuted_row_size = (topk * n_token + n_expert *
(align_block_size - 1) + align_block_size - (align_block_size - 1) + align_block_size -
1) // align_block_size * align_block_size 1) // align_block_size * align_block_size
permuted_idx = torch.full((permuted_row_size, ),
n_token * topk,
dtype=torch.int32,
device=hidden_states.device)
permuted_hidden_states = torch.empty((permuted_row_size, n_hidden), permuted_hidden_states = torch.empty((permuted_row_size, n_hidden),
device="cuda", device="cuda",
dtype=hidden_states.dtype) dtype=hidden_states.dtype)
@ -105,13 +118,16 @@ def torch_permute(hidden_states: torch.Tensor,
align_first_token_offset = align_expert_first_token_offset[i - 1] align_first_token_offset = align_expert_first_token_offset[i - 1]
align_last_token_offset = align_expert_first_token_offset[i] align_last_token_offset = align_expert_first_token_offset[i]
dst_row_id2src_row_id_in_expert = dst_row_id2src_row_id_map[ dst_row_id2src_row_id_in_expert = dst_row_id2src_row_id_map[
first_token_offset:first_token_offset + first_token_offset:first_token_offset + n_token_in_expert]
n_token_in_expert] % n_token
# store token in current expert with align_first_token_offset # store token in current expert with align_first_token_offset
permuted_hidden_states[align_first_token_offset:\ permuted_hidden_states[align_first_token_offset:\
align_first_token_offset+n_token_in_expert,\ align_first_token_offset+n_token_in_expert,\
...] = hidden_states[\ ...] = hidden_states[\
dst_row_id2src_row_id_in_expert, ...] dst_row_id2src_row_id_in_expert // topk,\
...]
permuted_idx[align_first_token_offset:\
align_first_token_offset+\
n_token_in_expert] = dst_row_id2src_row_id_in_expert
# set current expert m_indices # set current expert m_indices
m_indices[align_first_token_offset:align_last_token_offset] = i - 1 m_indices[align_first_token_offset:align_last_token_offset] = i - 1
valid_row_idx += [ valid_row_idx += [
@ -135,7 +151,7 @@ def torch_permute(hidden_states: torch.Tensor,
src2dst_idx].reshape((n_token, topk)) src2dst_idx].reshape((n_token, topk))
return [ return [
permuted_hidden_states, align_expert_first_token_offset, permuted_hidden_states, align_expert_first_token_offset,
align_src_row_id2dst_row_id, m_indices, valid_row_idx align_src_row_id2dst_row_id, permuted_idx, m_indices, valid_row_idx
] ]
@ -146,15 +162,18 @@ def torch_unpermute(permuted_hidden_states: torch.Tensor,
valid_row_idx: torch.Tensor, topk: int, valid_row_idx: torch.Tensor, topk: int,
n_expert: int) -> torch.Tensor: n_expert: int) -> torch.Tensor:
# ignore invalid row # ignore invalid row
n_hidden = permuted_hidden_states.shape[1]
mask = torch.zeros(permuted_hidden_states.shape[0], mask = torch.zeros(permuted_hidden_states.shape[0],
dtype=bool, dtype=bool,
device="cuda") device="cuda")
mask[valid_row_idx] = True mask[valid_row_idx] = True
permuted_hidden_states[~mask] = 0 permuted_hidden_states[~mask] = 0
idx = src_row_id2dst_row_id_map.flatten()[
token_expert_indices.flatten()].reshape(token_expert_indices.shape) permuted_hidden_states = permuted_hidden_states[
output = permuted_hidden_states[idx, ...] * topk_weights[..., None] src_row_id2dst_row_id_map.flatten(), ...]
output = output.sum(dim=1).to(permuted_hidden_states.dtype) permuted_hidden_states = permuted_hidden_states.view(-1, topk, n_hidden)
output = (permuted_hidden_states * topk_weights.unsqueeze(2)).sum(1).to(
permuted_hidden_states.dtype)
return output return output
@ -184,43 +203,56 @@ def test_moe_permute_unpermute(n_token: int, n_hidden: int, topk: int,
gating_output = torch.randn((n_token, n_expert), device="cuda").to(dtype) gating_output = torch.randn((n_token, n_expert), device="cuda").to(dtype)
topk_weights, topk_ids, token_expert_indices = fused_topk( topk_weights, topk_ids, token_expert_indices = fused_topk(
hidden_states, gating_output, topk, False) hidden_states, gating_output, topk, False)
gold0, gold1, gold2, gold3, valid_row_idx = torch_permute( (gold_permuted_hidden_states, gold_expert_first_token_offset,
hidden_states, gold_inv_permuted_idx, gold_permuted_idx, gold_m_indices,
topk_ids, valid_row_idx) = torch_permute(
token_expert_indices, hidden_states,
topk, topk_ids,
n_expert, # token_expert_indices,
n_local_expert, topk,
start_expert, n_expert,
expert_map=expert_map, n_local_expert,
align_block_size=align_block_size, start_expert,
fill_invalid_expert=fill_invalid_expert) expert_map=expert_map,
align_block_size=align_block_size,
fill_invalid_expert=fill_invalid_expert)
result0, result1, result2, result3 = moe_permute( (permuted_hidden_states, _, expert_first_token_offset, inv_permuted_idx,
hidden_states, topk_weights, topk_ids, token_expert_indices, topk, m_indices) = moe_permute(hidden_states=hidden_states,
n_expert, n_local_expert, expert_map, align_block_size, a1q_scale=None,
fill_invalid_expert) topk_ids=topk_ids,
n_expert=n_expert,
n_local_expert=n_local_expert,
expert_map=expert_map,
align_block_size=align_block_size,
fill_invalid_expert=fill_invalid_expert)
# check expert_first_token_offset # check expert_first_token_offset
torch.testing.assert_close(gold1, result1, atol=0, rtol=0) torch.testing.assert_close(gold_expert_first_token_offset,
# check src_row_id2dst_row_id_map expert_first_token_offset,
torch.testing.assert_close(gold2, result2, atol=0, rtol=0) atol=0,
# check mindice rtol=0)
torch.testing.assert_close(gold3, result3, atol=0, rtol=0) # check src_row_id2dst_row_id_map
# check permuted_hidden_states, only valid token torch.testing.assert_close(gold_inv_permuted_idx.flatten(),
torch.testing.assert_close(gold0[valid_row_idx], inv_permuted_idx,
result0[valid_row_idx], atol=0,
rtol=0)
# check mindice
torch.testing.assert_close(gold_m_indices, m_indices, atol=0, rtol=0)
# check permuted_hidden_states, only valid token
torch.testing.assert_close(gold_permuted_hidden_states[valid_row_idx],
permuted_hidden_states[valid_row_idx],
atol=0, atol=0,
rtol=0) rtol=0)
# add a random tensor to simulate group gemm # add a random tensor to simulate group gemm
result0 = 0.5 * result0 + torch.randn_like(result0) result0 = 0.5 * permuted_hidden_states + torch.randn_like(
permuted_hidden_states)
result4 = torch.empty_like(hidden_states)
moe_unpermute(result4, result0, topk_weights, inv_permuted_idx,
expert_first_token_offset)
result4 = moe_unpermute(result0, topk_weights, topk_ids, result2, result1,
topk, n_expert, n_local_expert)
gold4 = torch_unpermute(result0, topk_weights, topk_ids, gold4 = torch_unpermute(result0, topk_weights, topk_ids,
token_expert_indices, result2, valid_row_idx, topk, token_expert_indices, inv_permuted_idx,
n_local_expert) valid_row_idx, topk, n_local_expert)
# check unpermuted hidden # check unpermuted hidden
torch.testing.assert_close(result4, gold4, atol=2e-2, rtol=0) torch.testing.assert_close(result4, gold4, atol=2e-2, rtol=0)

View File

@ -22,10 +22,12 @@ REVISION_ROBERTA = os.environ.get("REVISION", "main")
@pytest.mark.skipif(current_platform.is_rocm(), @pytest.mark.skipif(current_platform.is_rocm(),
reason="Xformers backend is not supported on ROCm.") reason="Xformers backend is not supported on ROCm.")
def test_model_loading_with_params(vllm_runner): def test_model_loading_with_params(vllm_runner, monkeypatch):
""" """
Test parameter weight loading with tp>1. Test parameter weight loading with tp>1.
""" """
# to use apply_model
monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
with vllm_runner(model_name=MODEL_NAME, with vllm_runner(model_name=MODEL_NAME,
revision=REVISION, revision=REVISION,
dtype="float16", dtype="float16",
@ -61,10 +63,12 @@ def test_model_loading_with_params(vllm_runner):
@pytest.mark.skipif(current_platform.is_rocm(), @pytest.mark.skipif(current_platform.is_rocm(),
reason="Xformers backend is not supported on ROCm.") reason="Xformers backend is not supported on ROCm.")
def test_roberta_model_loading_with_params(vllm_runner): def test_roberta_model_loading_with_params(vllm_runner, monkeypatch):
""" """
Test parameter weight loading with tp>1. Test parameter weight loading with tp>1.
""" """
# to use apply_model
monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
with vllm_runner(model_name=MODEL_NAME_ROBERTA, with vllm_runner(model_name=MODEL_NAME_ROBERTA,
revision=REVISION_ROBERTA, revision=REVISION_ROBERTA,
dtype="float16", dtype="float16",
@ -101,10 +105,12 @@ def test_roberta_model_loading_with_params(vllm_runner):
@pytest.mark.skipif(current_platform.is_rocm(), @pytest.mark.skipif(current_platform.is_rocm(),
reason="Xformers backend is not supported on ROCm.") reason="Xformers backend is not supported on ROCm.")
def test_facebook_roberta_model_loading_with_params(vllm_runner): def test_facebook_roberta_model_loading_with_params(vllm_runner, monkeypatch):
""" """
Test loading roberta-base model with no lm_head. Test loading roberta-base model with no lm_head.
""" """
# to use apply_model
monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
model_name = "FacebookAI/roberta-base" model_name = "FacebookAI/roberta-base"
with vllm_runner(model_name=model_name, with vllm_runner(model_name=model_name,
dtype="float16", dtype="float16",

View File

@ -39,17 +39,9 @@ def v1(run_with_both_engines):
pytest.param("ssmits/Qwen2-7B-Instruct-embed-base", pytest.param("ssmits/Qwen2-7B-Instruct-embed-base",
marks=[pytest.mark.skip_v0, pytest.mark.cpu_model]), marks=[pytest.mark.skip_v0, pytest.mark.cpu_model]),
# [Encoder-only] # [Encoder-only]
pytest.param( pytest.param("BAAI/bge-base-en-v1.5", marks=[pytest.mark.core_model]),
"BAAI/bge-base-en-v1.5", pytest.param("sentence-transformers/all-MiniLM-L12-v2"),
marks=[ pytest.param("intfloat/multilingual-e5-small"),
# CPU only supports V1
pytest.mark.core_model,
pytest.mark.skip_v1
]),
pytest.param("sentence-transformers/all-MiniLM-L12-v2",
marks=[pytest.mark.skip_v1]),
pytest.param("intfloat/multilingual-e5-small",
marks=[pytest.mark.skip_v1]),
pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct", pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct",
marks=[pytest.mark.skip_v1]), marks=[pytest.mark.skip_v1]),
# [Cross-Encoder] # [Cross-Encoder]

View File

@ -23,6 +23,14 @@ RERANK_MODELS = [
] ]
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
    """Autouse wrapper that runs each test in this module with both engines.

    The body is intentionally empty: requesting the
    ``run_with_both_engines`` fixture is all that is needed. This wrapper
    can be promoted to ``conftest.py`` to cover every test in a package.
    """
@pytest.mark.parametrize("model_info", EMBEDDING_MODELS) @pytest.mark.parametrize("model_info", EMBEDDING_MODELS)
def test_embed_models_mteb(hf_runner, vllm_runner, def test_embed_models_mteb(hf_runner, vllm_runner,
model_info: EmbedModelInfo) -> None: model_info: EmbedModelInfo) -> None:

View File

@ -677,6 +677,7 @@ VLM_TEST_SETTINGS = {
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501 img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501 video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
multi_image_prompt="Picture 1: <vlm_image>\nPicture 2: <vlm_image>\nDescribe these two images with one paragraph respectively.", # noqa: E501
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
auto_cls=AutoModelForVision2Seq, auto_cls=AutoModelForVision2Seq,

View File

@ -22,6 +22,9 @@ from transformers import (AutoConfig, AutoProcessor, AutoTokenizer,
GenerationConfig) GenerationConfig)
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.v1.executor.abstract import Executor
from vllm.v1.kv_cache_interface import (ChunkedLocalAttentionSpec,
FullAttentionSpec)
from ....utils import multi_gpu_test from ....utils import multi_gpu_test
@ -69,6 +72,26 @@ def run_maverick_serving(model: str):
raise raise
def get_rope_layers_config(model_path: str) -> list[int]:
    """
    Read the interleaved RoPE layout from a checkpoint's HuggingFace config.

    Args:
        model_path: Path to the local directory containing the reduced
            Maverick model checkpoint

    Returns:
        The ``text_config.no_rope_layers`` list from ``config.json``: one
        0/1 entry per layer, where 1 means the layer uses RoPE (and local
        attention) and 0 means it does not.
    """
    raw_config = (Path(model_path) / "config.json").read_text()
    no_rope_layers = json.loads(raw_config)["text_config"]["no_rope_layers"]
    print(f"Found no_rope_layers: {no_rope_layers}")
    return no_rope_layers
def create_reduced_maverick_model( def create_reduced_maverick_model(
original_model_name: original_model_name:
str = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", str = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
@ -113,7 +136,6 @@ def create_reduced_maverick_model(
print("Loading original model configuration...") print("Loading original model configuration...")
original_config = AutoConfig.from_pretrained(original_model_name, original_config = AutoConfig.from_pretrained(original_model_name,
trust_remote_code=True) trust_remote_code=True)
print("Creating reduced configuration...") print("Creating reduced configuration...")
reduced_config = create_reduced_config(original_config, text_layers, reduced_config = create_reduced_config(original_config, text_layers,
num_experts, vision_layers) num_experts, vision_layers)
@ -510,21 +532,32 @@ def save_weights_to_safetensors(weights: dict[str, torch.Tensor],
f"{index_data['metadata']['total_size'] / (1024**3):.2f} GB") f"{index_data['metadata']['total_size'] / (1024**3):.2f} GB")
def run_reduced_model(model_path: str, def check_attention_spec_interleaved_rope(
should_profile: bool = False, llm: LLM,
**kwargs) -> None: num_attention_layers: int,
"""Test the created reduced model with vLLM.""" num_ranks: int,
rope_layers: list[int],
print(f"\nTesting reduced model at {model_path}...") ):
"""Check that the attention spec is correct."""
llm = LLM( assert isinstance(llm.llm_engine.model_executor, Executor)
model=model_path, kv_cache_specs_per_rank = llm.llm_engine.model_executor.get_kv_cache_specs(
trust_remote_code=True,
max_model_len=512, # Small context for testing
gpu_memory_utilization=0.3, # Conservative memory usage
**kwargs,
) )
for rank in range(num_ranks):
kv_cache_specs = kv_cache_specs_per_rank[rank]
assert len(kv_cache_specs.keys()) == num_attention_layers
for i in range(num_attention_layers):
if rope_layers[i] == 0:
expected_spec = FullAttentionSpec
else:
expected_spec = ChunkedLocalAttentionSpec
assert isinstance(
kv_cache_specs[
f"language_model.model.layers.{i}.self_attn.attn"],
expected_spec)
def run_reduced_model(llm: LLM, should_profile: bool = False) -> None:
"""Test the created reduced model with vLLM."""
sampling_params = SamplingParams(temperature=0.8, sampling_params = SamplingParams(temperature=0.8,
top_p=0.95, top_p=0.95,
max_tokens=50) max_tokens=50)
@ -551,6 +584,7 @@ def run_reduced_model(model_path: str,
@pytest.mark.parametrize("tp,ep", [(2, True)]) @pytest.mark.parametrize("tp,ep", [(2, True)])
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
def test_dummy_maverick( def test_dummy_maverick(
monkeypatch,
original_model_name: str, original_model_name: str,
text_layers: int, text_layers: int,
num_experts: int, num_experts: int,
@ -562,6 +596,10 @@ def test_dummy_maverick(
force_recreate: bool = True, force_recreate: bool = True,
profile: bool = False, profile: bool = False,
) -> None: ) -> None:
# Disabling multiprocessing lets us access the model executor from the LLM engine
monkeypatch.setenv("VLLM_USE_V1", "1")
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
model_path = create_reduced_maverick_model( model_path = create_reduced_maverick_model(
original_model_name=original_model_name, original_model_name=original_model_name,
output_dir=output_dir, output_dir=output_dir,
@ -573,11 +611,27 @@ def test_dummy_maverick(
print(f"\nReduced model created successfully at: {model_path}") print(f"\nReduced model created successfully at: {model_path}")
run_reduced_model(model_path=model_path, rope_layers = get_rope_layers_config(model_path)
should_profile=profile,
enforce_eager=enforce_eager, llm = LLM(
tensor_parallel_size=tp, model=model_path,
enable_expert_parallel=ep) trust_remote_code=True,
max_model_len=512, # Small context for testing
gpu_memory_utilization=0.3, # Conservative memory usage
enforce_eager=enforce_eager,
tensor_parallel_size=tp,
enable_expert_parallel=ep,
)
check_attention_spec_interleaved_rope(
llm,
text_layers,
tp,
rope_layers,
)
print(f"\nTesting reduced model at {model_path}...")
run_reduced_model(llm=llm, should_profile=profile)
def main(): def main():

View File

@ -0,0 +1,252 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
from collections.abc import Sequence
from typing import Optional
import librosa
import pytest
from huggingface_hub import snapshot_download
from vllm.assets.image import ImageAsset
from vllm.lora.request import LoRARequest
from vllm.multimodal.image import rescale_image_size
from vllm.platforms import current_platform
from ....conftest import (IMAGE_ASSETS, HfRunner, PromptAudioInput,
PromptImageInput, VllmRunner)
from ....utils import large_gpu_test
from ...utils import check_logprobs_close
# Single-image prompts, one per image asset name.
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
    "stop_sign":
    "<|user|>\n<|image|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n",  # noqa: E501
    "cherry_blossom":
    "<|user|>\n<|image|>\nPlease infer the season with reason in details.<|end|>\n<|assistant|>\n",  # noqa: E501
})
# Prompt with two image placeholders, used by the multi-image test.
HF_MULTIIMAGE_IMAGE_PROMPT = "<|user|>\n<|image|>\n<|image|>\nDescribe these images.<|end|>\n<|assistant|>\n"  # noqa: E501

# NOTE: this call runs at import time, i.e. as soon as the module is
# collected, and is pinned to a specific revision.
model_path = snapshot_download("microsoft/Phi-4-multimodal-instruct",
                               revision="refs/pr/70")
# Since the vision-lora and speech-lora co-exist with the base model,
# we have to manually specify the path of the lora weights.
vision_lora_path = os.path.join(model_path, "vision-lora")
# Audio example located inside the downloaded snapshot.
speech_question = os.path.join(model_path, "examples",
                               "what_is_shown_in_this_image.wav")
models = [model_path]
target_dtype = "half"

# ROCm Triton FA can run into shared memory issues with these models,
# use other backends in the meantime
# FIXME (mattwong, gshtrasb, hongxiayan)
if current_platform.is_rocm():
    os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"
def run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    inputs: Sequence[tuple[list[str], PromptImageInput,
                           Optional[PromptAudioInput]]],
    model: str,
    *,
    max_model_len: int,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    mm_limit: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
):
    """Check that HF and vLLM produce matching greedy logprobs.

    Each entry of ``inputs`` is a ``(prompts, images, audios)`` case. Every
    case is generated with vLLM (with the vision LoRA attached as a
    ``LoRARequest``) and with the HF runner (with the same adapter loaded),
    and the two results are compared with ``check_logprobs_close``.

    All the image fixtures for the test are from IMAGE_ASSETS.
    For the HuggingFace runner, PIL images are provided as input.
    For the vLLM runner, MultiModalDataDict objects and the corresponding
    MultiModalConfig are provided as input.
    Note that the text input is also adjusted to abide by the vLLM contract,
    and the text output is sanitized so it can be compared with HF.
    """
    # NOTE: the order matters. Run vLLM first, and then run HF.
    # vLLM needs a fresh new process without cuda initialization.
    # If HF ran first, cuda would already be initialized, which hurts the
    # multiprocessing backend with the fork method (the default method).

    # max_model_len should be greater than image_feature_size
    vllm_kwargs = dict(
        task="generate",
        max_model_len=max_model_len,
        max_num_seqs=2,
        dtype=dtype,
        limit_mm_per_prompt={"image": mm_limit},
        tensor_parallel_size=tensor_parallel_size,
        distributed_executor_backend=distributed_executor_backend,
        enable_lora=True,
        max_lora_rank=320,
        gpu_memory_utilization=0.8,  # set to 0.8 to avoid OOM in CI
        enforce_eager=True,
        trust_remote_code=False,
    )
    with vllm_runner(model, **vllm_kwargs) as vllm_model:
        lora_request = LoRARequest("vision", 1, vision_lora_path)
        vllm_outputs_per_case = []
        for prompts, images, audios in inputs:
            case_outputs = vllm_model.generate_greedy_logprobs(
                prompts,
                max_tokens,
                num_logprobs=num_logprobs,
                images=images,
                audios=audios,
                lora_request=lora_request)
            vllm_outputs_per_case.append(case_outputs)

    with hf_runner(model, dtype=dtype) as hf_model:
        hf_model.model.load_adapter(
            vision_lora_path,
            adapter_name="vision",
        )
        eos_token_id = hf_model.processor.tokenizer.eos_token_id
        hf_outputs_per_case = []
        for prompts, images, audios in inputs:
            case_outputs = hf_model.generate_greedy_logprobs_limit(
                prompts,
                max_tokens,
                num_logprobs=num_logprobs,
                images=images,
                audios=audios,
                eos_token_id=eos_token_id)
            hf_outputs_per_case.append(case_outputs)

    paired_outputs = zip(hf_outputs_per_case, vllm_outputs_per_case)
    for hf_outputs, vllm_outputs in paired_outputs:
        check_logprobs_close(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=vllm_outputs,
            name_0="hf",
            name_1="vllm",
        )
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "size_factors",
    [
        # No image
        [],
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.25, 0.5, 1.0],
    ],
)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_model_len", [12800])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
                dtype: str, max_model_len: int, max_tokens: int,
                num_logprobs: int) -> None:
    """Run the HF-vs-vLLM comparison on single-image prompts.

    One ``(prompts, images, audios)`` case is built per image asset: the
    asset's prompt is repeated once per entry of ``size_factors`` and paired
    with the asset image rescaled by that factor. No audio is supplied.
    """
    pil_images = [asset.pil_image for asset in image_assets]

    inputs_per_image = []
    for pil_image, prompt in zip(pil_images, HF_IMAGE_PROMPTS):
        # Same prompt for every scale of this image.
        repeated_prompts = [prompt] * len(size_factors)
        scaled_images = [
            rescale_image_size(pil_image, factor) for factor in size_factors
        ]
        inputs_per_image.append((repeated_prompts, scaled_images, None))

    run_test(
        hf_runner,
        vllm_runner,
        inputs_per_image,
        model,
        dtype=dtype,
        max_model_len=max_model_len,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        mm_limit=1,
        tensor_parallel_size=1,
    )
@large_gpu_test(min_gb=48)
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "size_factors",
    [
        # No image
        # [],
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.25, 0.5, 1.0],
    ],
)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_model_len", [25600])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
                             size_factors, dtype: str, max_model_len: int,
                             max_tokens: int, num_logprobs: int) -> None:
    """Run the HF-vs-vLLM comparison on prompts that carry several images.

    A single ``(prompts, images, audios)`` case is built: the multi-image
    prompt is repeated once per entry of ``size_factors``, and each repeat is
    paired with every image asset rescaled by that factor. No audio is
    supplied.
    """
    pil_images = [asset.pil_image for asset in image_assets]

    prompts = [HF_MULTIIMAGE_IMAGE_PROMPT] * len(size_factors)

    # One group of images per size factor; each group holds all the assets.
    image_groups = []
    for factor in size_factors:
        image_groups.append(
            [rescale_image_size(pil_image, factor) for pil_image in pil_images])

    inputs_per_case = [(prompts, image_groups, None)]

    run_test(
        hf_runner,
        vllm_runner,
        inputs_per_case,
        model,
        dtype=dtype,
        max_model_len=max_model_len,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        mm_limit=2,
        tensor_parallel_size=1,
    )
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_model_len", [12800])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_vision_speech_models(hf_runner, vllm_runner, model, dtype: str,
                              max_model_len: int, max_tokens: int,
                              num_logprobs: int) -> None:
    """Run the HF-vs-vLLM comparison on one combined image + audio prompt."""
    # use the example speech question so that the model outputs are reasonable
    speech_audio = librosa.load(speech_question, sr=16000)
    blossom_image = ImageAsset("cherry_blossom").pil_image.convert("RGB")

    prompt = "<|user|><|image|><|audio|><|end|><|assistant|>"
    # A single case with one prompt, one image and one audio clip.
    inputs_vision_speech = [([prompt], [blossom_image], [speech_audio])]

    run_test(
        hf_runner,
        vllm_runner,
        inputs_vision_speech,
        model,
        dtype=dtype,
        max_model_len=max_model_len,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        mm_limit=1,
        tensor_parallel_size=1,
    )

View File

@ -41,12 +41,18 @@ def glm4_1v_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict:
def _test_processing_correctness( def _test_processing_correctness(
model_id: str, model_id_or_arch: str,
hit_rate: float, hit_rate: float,
num_batches: int, num_batches: int,
simplify_rate: float, simplify_rate: float,
): ):
model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id) if model_id_or_arch in HF_EXAMPLE_MODELS.get_supported_archs():
# Use model architecture to get the default model id
model_info = HF_EXAMPLE_MODELS.get_hf_info(model_id_or_arch)
model_id = model_info.default
else:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id_or_arch)
model_id = model_id_or_arch
model_info.check_available_online(on_fail="skip") model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip") model_info.check_transformers_version(on_fail="skip")
@ -58,7 +64,7 @@ def _test_processing_correctness(
trust_remote_code=model_info.trust_remote_code, trust_remote_code=model_info.trust_remote_code,
seed=0, seed=0,
dtype="auto", dtype="auto",
revision=None, revision=model_info.revision,
hf_overrides=model_info.hf_overrides, hf_overrides=model_info.hf_overrides,
) )
@ -272,12 +278,14 @@ def _test_processing_correctness_one(
"THUDM/GLM-4.1V-9B-Thinking", "THUDM/GLM-4.1V-9B-Thinking",
"ibm-granite/granite-speech-3.3-2b", "ibm-granite/granite-speech-3.3-2b",
"h2oai/h2ovl-mississippi-800m", "h2oai/h2ovl-mississippi-800m",
"internlm/Intern-S1",
"OpenGVLab/InternVL2-1B", "OpenGVLab/InternVL2-1B",
"OpenGVLab/InternVL3-1B", "OpenGVLab/InternVL3-1B",
"HuggingFaceM4/Idefics3-8B-Llama3", "HuggingFaceM4/Idefics3-8B-Llama3",
"HuggingFaceTB/SmolVLM2-2.2B-Instruct", "HuggingFaceTB/SmolVLM2-2.2B-Instruct",
"moonshotai/Kimi-VL-A3B-Instruct", "moonshotai/Kimi-VL-A3B-Instruct",
"meta-llama/Llama-4-Scout-17B-16E-Instruct", "meta-llama/Llama-4-Scout-17B-16E-Instruct",
"naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B",
"llava-hf/llava-1.5-7b-hf", "llava-hf/llava-1.5-7b-hf",
"llava-hf/llava-v1.6-mistral-7b-hf", "llava-hf/llava-v1.6-mistral-7b-hf",
"llava-hf/LLaVA-NeXT-Video-7B-hf", "llava-hf/LLaVA-NeXT-Video-7B-hf",
@ -330,6 +338,28 @@ def test_processing_correctness(
) )
# Phi4MultimodalForCausalLM shares the same model repo as the original-format
# Phi4MMForCausalLM, so we add it as a separate test case.
# Remove this test after the conversion PR is merged:
# https://huggingface.co/microsoft/Phi-4-multimodal-instruct/discussions/70
@pytest.mark.parametrize("model_arch", ["Phi4MultimodalForCausalLM"])
@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
@pytest.mark.parametrize("num_batches", [32])
@pytest.mark.parametrize("simplify_rate", [1.0])
def test_processing_correctness_phi4_multimodal(
    model_arch: str,
    hit_rate: float,
    num_batches: int,
    simplify_rate: float,
):
    """Run the shared processing-correctness check for an architecture name.

    The architecture name is forwarded as the first positional argument of
    ``_test_processing_correctness``; the remaining parameters are passed
    through unchanged as keyword arguments.
    """
    options = {
        "hit_rate": hit_rate,
        "num_batches": num_batches,
        "simplify_rate": simplify_rate,
    }
    _test_processing_correctness(model_arch, **options)
def _assert_inputs_equal( def _assert_inputs_equal(
a: MultiModalInputs, a: MultiModalInputs,
b: MultiModalInputs, b: MultiModalInputs,

View File

@ -201,6 +201,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
trust_remote_code=True), trust_remote_code=True),
"HunYuanDenseV1ForCausalLM":_HfExamplesInfo("tencent/Hunyuan-7B-Instruct-0124", "HunYuanDenseV1ForCausalLM":_HfExamplesInfo("tencent/Hunyuan-7B-Instruct-0124",
trust_remote_code=True), trust_remote_code=True),
"HCXVisionForCausalLM": _HfExamplesInfo(
"naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B",
trust_remote_code=True),
"InternLMForCausalLM": _HfExamplesInfo("internlm/internlm-chat-7b", "InternLMForCausalLM": _HfExamplesInfo("internlm/internlm-chat-7b",
trust_remote_code=True), trust_remote_code=True),
"InternLM2ForCausalLM": _HfExamplesInfo("internlm/internlm2-chat-7b", "InternLM2ForCausalLM": _HfExamplesInfo("internlm/internlm2-chat-7b",
@ -218,6 +221,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"fp8": "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8"}), # noqa: E501 "fp8": "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8"}), # noqa: E501
"LLaMAForCausalLM": _HfExamplesInfo("decapoda-research/llama-7b-hf", "LLaMAForCausalLM": _HfExamplesInfo("decapoda-research/llama-7b-hf",
is_available_online=False), is_available_online=False),
"Llama4ForCausalLM": _HfExamplesInfo("meta-llama/Llama-4-Scout-17B-16E-Instruct", # noqa: E501
is_available_online=False),
"MambaForCausalLM": _HfExamplesInfo("state-spaces/mamba-130m-hf"), "MambaForCausalLM": _HfExamplesInfo("state-spaces/mamba-130m-hf"),
"Mamba2ForCausalLM": _HfExamplesInfo("mistralai/Mamba-Codestral-7B-v0.1"), "Mamba2ForCausalLM": _HfExamplesInfo("mistralai/Mamba-Codestral-7B-v0.1"),
"FalconMambaForCausalLM": _HfExamplesInfo("tiiuae/falcon-mamba-7b-instruct"), # noqa: E501 "FalconMambaForCausalLM": _HfExamplesInfo("tiiuae/falcon-mamba-7b-instruct"), # noqa: E501
@ -376,6 +381,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
extras={"2B": "OpenGVLab/InternVL2-2B", extras={"2B": "OpenGVLab/InternVL2-2B",
"3.0": "OpenGVLab/InternVL3-1B"}, # noqa: E501 "3.0": "OpenGVLab/InternVL3-1B"}, # noqa: E501
trust_remote_code=True), trust_remote_code=True),
"InternS1ForConditionalGeneration": _HfExamplesInfo("internlm/Intern-S1",
trust_remote_code=True),
"Idefics3ForConditionalGeneration": _HfExamplesInfo("HuggingFaceM4/Idefics3-8B-Llama3", # noqa: E501 "Idefics3ForConditionalGeneration": _HfExamplesInfo("HuggingFaceM4/Idefics3-8B-Llama3", # noqa: E501
{"tiny": "HuggingFaceTB/SmolVLM-256M-Instruct"}), # noqa: E501 {"tiny": "HuggingFaceTB/SmolVLM-256M-Instruct"}), # noqa: E501
"KeyeForConditionalGeneration": _HfExamplesInfo("Kwai-Keye/Keye-VL-8B-Preview", # noqa: E501 "KeyeForConditionalGeneration": _HfExamplesInfo("Kwai-Keye/Keye-VL-8B-Preview", # noqa: E501
@ -426,6 +433,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"1.6-gemma": "AIDC-AI/Ovis1.6-Gemma2-9B"}), # noqa: E501 "1.6-gemma": "AIDC-AI/Ovis1.6-Gemma2-9B"}), # noqa: E501
"Phi4MMForCausalLM": _HfExamplesInfo("microsoft/Phi-4-multimodal-instruct", "Phi4MMForCausalLM": _HfExamplesInfo("microsoft/Phi-4-multimodal-instruct",
trust_remote_code=True), trust_remote_code=True),
"Phi4MultimodalForCausalLM": _HfExamplesInfo("microsoft/Phi-4-multimodal-instruct", # noqa: E501
revision="refs/pr/70"),
"PixtralForConditionalGeneration": _HfExamplesInfo("mistralai/Pixtral-12B-2409", # noqa: E501 "PixtralForConditionalGeneration": _HfExamplesInfo("mistralai/Pixtral-12B-2409", # noqa: E501
tokenizer_mode="mistral"), tokenizer_mode="mistral"),
"QwenVLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen-VL", "QwenVLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen-VL",

View File

@ -17,7 +17,9 @@ from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tenso
CompressedTensorsW4A4Fp4, CompressedTensorsW4A16Fp4, CompressedTensorsW4A4Fp4, CompressedTensorsW4A16Fp4,
CompressedTensorsW4A16Sparse24, CompressedTensorsW8A8Fp8, CompressedTensorsW4A16Sparse24, CompressedTensorsW8A8Fp8,
CompressedTensorsW8A8Int8, CompressedTensorsW8A16Fp8, CompressedTensorsW8A8Int8, CompressedTensorsW8A16Fp8,
CompressedTensorsWNA16, cutlass_fp4_supported) CompressedTensorsWNA16)
from vllm.model_executor.layers.quantization.utils.quant_utils import (
cutlass_fp4_supported)
from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
sparse_cutlass_supported) sparse_cutlass_supported)
from vllm.platforms import current_platform from vllm.platforms import current_platform

View File

@ -8,7 +8,10 @@ import pytest
from tests.quantization.utils import is_quant_method_supported from tests.quantization.utils import is_quant_method_supported
MODELS = ["microsoft/Phi-3-mini-4k-instruct"] MODELS = [
"microsoft/Phi-3-mini-4k-instruct", # dense model
"ai21labs/Jamba-tiny-dev", # MoE model
]
@pytest.mark.skipif(not is_quant_method_supported("rtn"), @pytest.mark.skipif(not is_quant_method_supported("rtn"),

View File

@ -4,6 +4,7 @@
import pytest import pytest
import torch import torch
from vllm.model_executor.models.fuyu import FuyuImagePatchInputs
from vllm.model_executor.models.phi3v import Phi3VImagePixelInputs from vllm.model_executor.models.phi3v import Phi3VImagePixelInputs
@ -124,3 +125,24 @@ def test_tensor_schema_with_invalid_resolve_binding_dims():
"w": 336 "w": 336
}, },
) )
def test_tensor_schema_with_list_of_symbolic_dim():
    """Build FuyuImagePatchInputs where the list length equals the batch dim.

    The test passes as long as construction raises no exception.
    """
    rows = [torch.randn(768) for _ in range(3)]
    stacked = torch.stack(rows)  # (bn=3, fn)
    patch_counts = [64, 64, 64]  # len = bn = 3

    FuyuImagePatchInputs(flat_data=stacked, patches_per_image=patch_counts)
def test_tensor_schema_with_list_of_symbolic_dim_mismatch_in_length():
    """Expect a ValueError when the list length differs from the batch dim."""
    rows = [torch.randn(768) for _ in range(4)]
    stacked = torch.stack(rows)  # (bn=4, fn)
    patch_counts = [64, 64, 64]  # len = 3, which does not match bn = 4

    expected_message = "expected 'bn'=4, got 3"
    with pytest.raises(ValueError, match=expected_message):
        FuyuImagePatchInputs(flat_data=stacked, patches_per_image=patch_counts)

View File

@ -93,6 +93,7 @@ def create_common_attn_metadata(
max_query_len=max_query_len, max_query_len=max_query_len,
block_table_tensor=block_table_tensor, block_table_tensor=block_table_tensor,
slot_mapping=slot_mapping, slot_mapping=slot_mapping,
causal=True,
) )

View File

@ -13,7 +13,6 @@ UNSUPPORTED_MODELS_V1 = [
"openai/whisper-large-v3", # transcription "openai/whisper-large-v3", # transcription
"facebook/bart-large-cnn", # encoder decoder "facebook/bart-large-cnn", # encoder decoder
"state-spaces/mamba-130m-hf", # mamba1 "state-spaces/mamba-130m-hf", # mamba1
"BAAI/bge-m3", # embedding
] ]
MODEL = "meta-llama/Llama-3.2-1B-Instruct" MODEL = "meta-llama/Llama-3.2-1B-Instruct"

View File

@ -1,9 +1,8 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import re
import pytest import pytest
import regex as re
import requests import requests
import torch import torch

View File

@ -59,7 +59,7 @@ def test_basic(
# actually test chunked prompt # actually test chunked prompt
max_num_batched_tokens=1024, max_num_batched_tokens=1024,
max_model_len=8192, max_model_len=8192,
gpu_memory_utilization=0.95, gpu_memory_utilization=0.7,
max_num_seqs=max_num_seqs, max_num_seqs=max_num_seqs,
tensor_parallel_size=tensor_parallel_size) as vllm_model: tensor_parallel_size=tensor_parallel_size) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, vllm_outputs = vllm_model.generate_greedy(example_prompts,

View File

@ -67,4 +67,9 @@ class InfEncoder(json.JSONEncoder):
def write_to_json(filename: str, records: list) -> None: def write_to_json(filename: str, records: list) -> None:
with open(filename, "w") as f: with open(filename, "w") as f:
json.dump(records, f, cls=InfEncoder) json.dump(
records,
f,
cls=InfEncoder,
default=lambda o: f"<{type(o).__name__} is not JSON serializable>",
)

View File

@ -4790,26 +4790,26 @@ class VllmConfig:
def __str__(self): def __str__(self):
return ( return (
f"model={self.model_config.model!r}," f"model={self.model_config.model!r}, "
f" speculative_config={self.speculative_config!r}," f"speculative_config={self.speculative_config!r}, "
f" tokenizer={self.model_config.tokenizer!r}, " f"tokenizer={self.model_config.tokenizer!r}, "
f"skip_tokenizer_init={self.model_config.skip_tokenizer_init}," f"skip_tokenizer_init={self.model_config.skip_tokenizer_init}, "
f" tokenizer_mode={self.model_config.tokenizer_mode}, " f"tokenizer_mode={self.model_config.tokenizer_mode}, "
f"revision={self.model_config.revision}, " f"revision={self.model_config.revision}, "
f"override_neuron_config={self.model_config.override_neuron_config}," f"override_neuron_config={self.model_config.override_neuron_config}, " # noqa
f" tokenizer_revision={self.model_config.tokenizer_revision}, " f"tokenizer_revision={self.model_config.tokenizer_revision}, "
f"trust_remote_code={self.model_config.trust_remote_code}, " f"trust_remote_code={self.model_config.trust_remote_code}, "
f"dtype={self.model_config.dtype}, " f"dtype={self.model_config.dtype}, "
f"max_seq_len={self.model_config.max_model_len}," f"max_seq_len={self.model_config.max_model_len}, "
f" download_dir={self.load_config.download_dir!r}, " f"download_dir={self.load_config.download_dir!r}, "
f"load_format={self.load_config.load_format}, " f"load_format={self.load_config.load_format}, "
f"tensor_parallel_size={self.parallel_config.tensor_parallel_size}," f"tensor_parallel_size={self.parallel_config.tensor_parallel_size}, " # noqa
f" pipeline_parallel_size={self.parallel_config.pipeline_parallel_size}, " # noqa f"pipeline_parallel_size={self.parallel_config.pipeline_parallel_size}, " # noqa
f"disable_custom_all_reduce={self.parallel_config.disable_custom_all_reduce}, " # noqa f"disable_custom_all_reduce={self.parallel_config.disable_custom_all_reduce}, " # noqa
f"quantization={self.model_config.quantization}, " f"quantization={self.model_config.quantization}, "
f"enforce_eager={self.model_config.enforce_eager}, " f"enforce_eager={self.model_config.enforce_eager}, "
f"kv_cache_dtype={self.cache_config.cache_dtype}, " f"kv_cache_dtype={self.cache_config.cache_dtype}, "
f" device_config={self.device_config.device}, " f"device_config={self.device_config.device}, "
f"decoding_config={self.decoding_config!r}, " f"decoding_config={self.decoding_config!r}, "
f"observability_config={self.observability_config!r}, " f"observability_config={self.observability_config!r}, "
f"seed={self.model_config.seed}, " f"seed={self.model_config.seed}, "

View File

@ -156,8 +156,16 @@ class SharedStorageConnector(KVConnectorBase_V1):
logger.info("Inject KV cache of %d tokens to the paged memory", logger.info("Inject KV cache of %d tokens to the paged memory",
len(request.slot_mapping)) len(request.slot_mapping))
for layer_name in forward_context.no_compile_layers: for layer_name in forward_context.no_compile_layers:
attn_layer = forward_context.no_compile_layers[layer_name] layer = forward_context.no_compile_layers[layer_name]
kv_cache_layer = attn_layer.kv_cache[\
# Only process layers that have kv_cache
# attribute (attention layers) Skip non-attention
# layers like FusedMoE/MLP etc.
kv_cache_attr = getattr(layer, 'kv_cache', None)
if kv_cache_attr is None:
continue
kv_cache_layer = kv_cache_attr[ \
forward_context.virtual_engine] forward_context.virtual_engine]
filename = self._generate_filename_debug( filename = self._generate_filename_debug(

View File

@ -1649,7 +1649,8 @@ class EngineArgs:
if (self.max_num_seqs is None if (self.max_num_seqs is None
and usage_context in default_max_num_seqs): and usage_context in default_max_num_seqs):
self.max_num_seqs = default_max_num_seqs[usage_context] self.max_num_seqs = min(default_max_num_seqs[usage_context],
self.max_num_batched_tokens or sys.maxsize)
logger.debug("Setting max_num_seqs to %d for %s usage context.", logger.debug("Setting max_num_seqs to %d for %s usage context.",
self.max_num_seqs, use_context_value) self.max_num_seqs, use_context_value)

View File

@ -97,11 +97,16 @@ class MQLLMEngineClient(EngineClient):
self.model_config = engine_config.model_config self.model_config = engine_config.model_config
self.decoding_config = engine_config.decoding_config self.decoding_config = engine_config.decoding_config
# Create the tokenizer group. if self.vllm_config.model_config.skip_tokenizer_init:
self.tokenizer = init_tokenizer_from_configs( self.tokenizer = None
model_config=self.model_config,
scheduler_config=engine_config.scheduler_config, else:
lora_config=engine_config.lora_config) # Create the tokenizer group.
self.tokenizer = init_tokenizer_from_configs(
model_config=self.model_config,
scheduler_config=engine_config.scheduler_config,
lora_config=engine_config.lora_config)
self.input_preprocessor = InputPreprocessor(self.model_config, self.input_preprocessor = InputPreprocessor(self.model_config,
self.tokenizer) self.tokenizer)
@ -375,7 +380,10 @@ class MQLLMEngineClient(EngineClient):
return self.input_preprocessor return self.input_preprocessor
async def get_tokenizer(self, lora_request: Optional[LoRARequest] = None): async def get_tokenizer(self, lora_request: Optional[LoRARequest] = None):
return await self.tokenizer.get_lora_tokenizer_async(lora_request) if self.tokenizer is None:
return None
else:
return await self.tokenizer.get_lora_tokenizer_async(lora_request)
async def get_vllm_config(self) -> VllmConfig: async def get_vllm_config(self) -> VllmConfig:
return self.vllm_config return self.vllm_config

View File

@ -14,6 +14,7 @@ from pydantic import ValidationError
from tqdm.auto import tqdm from tqdm.auto import tqdm
from typing_extensions import TypeVar, deprecated from typing_extensions import TypeVar, deprecated
import vllm.envs as envs
from vllm.beam_search import (BeamSearchInstance, BeamSearchOutput, from vllm.beam_search import (BeamSearchInstance, BeamSearchOutput,
BeamSearchSequence, BeamSearchSequence,
create_sort_beams_key_function) create_sort_beams_key_function)
@ -44,9 +45,10 @@ from vllm.model_executor.layers.quantization import QuantizationMethods
from vllm.outputs import (ClassificationRequestOutput, EmbeddingRequestOutput, from vllm.outputs import (ClassificationRequestOutput, EmbeddingRequestOutput,
PoolingRequestOutput, RequestOutput, PoolingRequestOutput, RequestOutput,
ScoringRequestOutput) ScoringRequestOutput)
from vllm.pooling_params import PoolingParams, PoolingTask from vllm.pooling_params import PoolingParams
from vllm.sampling_params import (BeamSearchParams, GuidedDecodingParams, from vllm.sampling_params import (BeamSearchParams, GuidedDecodingParams,
RequestOutputKind, SamplingParams) RequestOutputKind, SamplingParams)
from vllm.tasks import PoolingTask
from vllm.transformers_utils.tokenizer import (AnyTokenizer, MistralTokenizer, from vllm.transformers_utils.tokenizer import (AnyTokenizer, MistralTokenizer,
get_cached_tokenizer) get_cached_tokenizer)
from vllm.usage.usage_lib import UsageContext from vllm.usage.usage_lib import UsageContext
@ -277,6 +279,16 @@ class LLM:
self.request_counter = Counter() self.request_counter = Counter()
self.default_sampling_params: Union[dict[str, Any], None] = None self.default_sampling_params: Union[dict[str, Any], None] = None
if envs.VLLM_USE_V1:
supported_tasks = self.llm_engine \
.get_supported_tasks() # type: ignore
else:
supported_tasks = self.llm_engine.model_config.supported_tasks
logger.info("Supported_tasks: %s", supported_tasks)
self.supported_tasks = supported_tasks
def get_tokenizer( def get_tokenizer(
self, self,
lora_request: Optional[LoRARequest] = None, lora_request: Optional[LoRARequest] = None,
@ -1170,8 +1182,7 @@ class LLM:
A list of `EmbeddingRequestOutput` objects containing the A list of `EmbeddingRequestOutput` objects containing the
embedding vectors in the same order as the input prompts. embedding vectors in the same order as the input prompts.
""" """
model_config = self.llm_engine.model_config if "embed" not in self.supported_tasks:
if "embed" not in model_config.supported_tasks:
raise ValueError("Embedding API is not supported by this model. " raise ValueError("Embedding API is not supported by this model. "
"Please set `--task embed`.") "Please set `--task embed`.")
@ -1215,8 +1226,7 @@ class LLM:
A list of `ClassificationRequestOutput` objects containing the A list of `ClassificationRequestOutput` objects containing the
embedding vectors in the same order as the input prompts. embedding vectors in the same order as the input prompts.
""" """
model_config = self.llm_engine.model_config if "classify" not in self.supported_tasks:
if "classify" not in model_config.supported_tasks:
raise ValueError( raise ValueError(
"Classification API is not supported by this model. " "Classification API is not supported by this model. "
"Please set `--task classify`.") "Please set `--task classify`.")
@ -1397,8 +1407,8 @@ class LLM:
raise ValueError(" ".join(messages)) raise ValueError(" ".join(messages))
if all(t not in model_config.supported_tasks supported_tasks = self.supported_tasks
for t in ("embed", "classify")): if all(t not in supported_tasks for t in ("embed", "classify")):
raise ValueError("Score API is not supported by this model. " raise ValueError("Score API is not supported by this model. "
"Please set `--task embed` or `--task classify`.") "Please set `--task embed` or `--task classify`.")

View File

@ -1586,6 +1586,14 @@ async def init_app_state(
state.vllm_config = vllm_config state.vllm_config = vllm_config
model_config = vllm_config.model_config model_config = vllm_config.model_config
if envs.VLLM_USE_V1:
supported_tasks = await engine_client \
.get_supported_tasks() # type: ignore
else:
supported_tasks = model_config.supported_tasks
logger.info("Supported_tasks: %s", supported_tasks)
resolved_chat_template = load_chat_template(args.chat_template) resolved_chat_template = load_chat_template(args.chat_template)
if resolved_chat_template is not None: if resolved_chat_template is not None:
# Get the tokenizer to check official template # Get the tokenizer to check official template
@ -1647,7 +1655,7 @@ async def init_app_state(
reasoning_parser=args.reasoning_parser, reasoning_parser=args.reasoning_parser,
enable_prompt_tokens_details=args.enable_prompt_tokens_details, enable_prompt_tokens_details=args.enable_prompt_tokens_details,
enable_force_include_usage=args.enable_force_include_usage, enable_force_include_usage=args.enable_force_include_usage,
) if "generate" in model_config.supported_tasks else None ) if "generate" in supported_tasks else None
state.openai_serving_chat = OpenAIServingChat( state.openai_serving_chat = OpenAIServingChat(
engine_client, engine_client,
model_config, model_config,
@ -1664,7 +1672,7 @@ async def init_app_state(
reasoning_parser=args.reasoning_parser, reasoning_parser=args.reasoning_parser,
enable_prompt_tokens_details=args.enable_prompt_tokens_details, enable_prompt_tokens_details=args.enable_prompt_tokens_details,
enable_force_include_usage=args.enable_force_include_usage, enable_force_include_usage=args.enable_force_include_usage,
) if "generate" in model_config.supported_tasks else None ) if "generate" in supported_tasks else None
state.openai_serving_completion = OpenAIServingCompletion( state.openai_serving_completion = OpenAIServingCompletion(
engine_client, engine_client,
model_config, model_config,
@ -1673,7 +1681,7 @@ async def init_app_state(
return_tokens_as_token_ids=args.return_tokens_as_token_ids, return_tokens_as_token_ids=args.return_tokens_as_token_ids,
enable_prompt_tokens_details=args.enable_prompt_tokens_details, enable_prompt_tokens_details=args.enable_prompt_tokens_details,
enable_force_include_usage=args.enable_force_include_usage, enable_force_include_usage=args.enable_force_include_usage,
) if "generate" in model_config.supported_tasks else None ) if "generate" in supported_tasks else None
state.openai_serving_pooling = OpenAIServingPooling( state.openai_serving_pooling = OpenAIServingPooling(
engine_client, engine_client,
model_config, model_config,
@ -1681,7 +1689,7 @@ async def init_app_state(
request_logger=request_logger, request_logger=request_logger,
chat_template=resolved_chat_template, chat_template=resolved_chat_template,
chat_template_content_format=args.chat_template_content_format, chat_template_content_format=args.chat_template_content_format,
) if "encode" in model_config.supported_tasks else None ) if "encode" in supported_tasks else None
state.openai_serving_embedding = OpenAIServingEmbedding( state.openai_serving_embedding = OpenAIServingEmbedding(
engine_client, engine_client,
model_config, model_config,
@ -1689,24 +1697,22 @@ async def init_app_state(
request_logger=request_logger, request_logger=request_logger,
chat_template=resolved_chat_template, chat_template=resolved_chat_template,
chat_template_content_format=args.chat_template_content_format, chat_template_content_format=args.chat_template_content_format,
) if "embed" in model_config.supported_tasks else None ) if "embed" in supported_tasks else None
state.openai_serving_classification = ServingClassification( state.openai_serving_classification = ServingClassification(
engine_client, engine_client,
model_config, model_config,
state.openai_serving_models, state.openai_serving_models,
request_logger=request_logger, request_logger=request_logger,
) if "classify" in model_config.supported_tasks else None ) if "classify" in supported_tasks else None
enable_serving_reranking = ("classify" in model_config.supported_tasks enable_serving_reranking = ("classify" in supported_tasks and getattr(
and getattr(model_config.hf_config, model_config.hf_config, "num_labels", 0) == 1)
"num_labels", 0) == 1)
state.openai_serving_scores = ServingScores( state.openai_serving_scores = ServingScores(
engine_client, engine_client,
model_config, model_config,
state.openai_serving_models, state.openai_serving_models,
request_logger=request_logger, request_logger=request_logger,
) if ("embed" in model_config.supported_tasks ) if ("embed" in supported_tasks or enable_serving_reranking) else None
or enable_serving_reranking) else None
state.openai_serving_tokenization = OpenAIServingTokenization( state.openai_serving_tokenization = OpenAIServingTokenization(
engine_client, engine_client,
@ -1721,13 +1727,13 @@ async def init_app_state(
model_config, model_config,
state.openai_serving_models, state.openai_serving_models,
request_logger=request_logger, request_logger=request_logger,
) if "transcription" in model_config.supported_tasks else None ) if "transcription" in supported_tasks else None
state.openai_serving_translation = OpenAIServingTranslation( state.openai_serving_translation = OpenAIServingTranslation(
engine_client, engine_client,
model_config, model_config,
state.openai_serving_models, state.openai_serving_models,
request_logger=request_logger, request_logger=request_logger,
) if "transcription" in model_config.supported_tasks else None ) if "transcription" in supported_tasks else None
state.task = model_config.task state.task = model_config.task
state.enable_server_load_tracking = args.enable_server_load_tracking state.enable_server_load_tracking = args.enable_server_load_tracking

View File

@ -1007,6 +1007,13 @@ class CompletionRequest(OpenAIBaseModel):
"default: 0). Any priority other than 0 will raise an error " "default: 0). Any priority other than 0 will raise an error "
"if the served model does not use priority scheduling."), "if the served model does not use priority scheduling."),
) )
request_id: str = Field(
default_factory=lambda: f"{random_uuid()}",
description=(
"The request_id related to this request. If the caller does "
"not set it, a random_uuid will be generated. This id is used "
"through out the inference process and return in response."),
)
logits_processors: Optional[LogitsProcessors] = Field( logits_processors: Optional[LogitsProcessors] = Field(
default=None, default=None,
description=( description=(
@ -1251,6 +1258,13 @@ class EmbeddingCompletionRequest(OpenAIBaseModel):
"default: 0). Any priority other than 0 will raise an error " "default: 0). Any priority other than 0 will raise an error "
"if the served model does not use priority scheduling."), "if the served model does not use priority scheduling."),
) )
request_id: str = Field(
default_factory=lambda: f"{random_uuid()}",
description=(
"The request_id related to this request. If the caller does "
"not set it, a random_uuid will be generated. This id is used "
"through out the inference process and return in response."),
)
# --8<-- [end:embedding-extra-params] # --8<-- [end:embedding-extra-params]
@ -1302,6 +1316,13 @@ class EmbeddingChatRequest(OpenAIBaseModel):
"default: 0). Any priority other than 0 will raise an error " "default: 0). Any priority other than 0 will raise an error "
"if the served model does not use priority scheduling."), "if the served model does not use priority scheduling."),
) )
request_id: str = Field(
default_factory=lambda: f"{random_uuid()}",
description=(
"The request_id related to this request. If the caller does "
"not set it, a random_uuid will be generated. This id is used "
"through out the inference process and return in response."),
)
# --8<-- [end:chat-embedding-extra-params] # --8<-- [end:chat-embedding-extra-params]
@model_validator(mode="before") @model_validator(mode="before")

View File

@ -14,6 +14,7 @@ import torch
from prometheus_client import start_http_server from prometheus_client import start_http_server
from tqdm import tqdm from tqdm import tqdm
import vllm.envs as envs
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.engine.arg_utils import AsyncEngineArgs, optional_type from vllm.engine.arg_utils import AsyncEngineArgs, optional_type
from vllm.engine.protocol import EngineClient from vllm.engine.protocol import EngineClient
@ -335,6 +336,14 @@ async def run_batch(
model_config = vllm_config.model_config model_config = vllm_config.model_config
if envs.VLLM_USE_V1:
supported_tasks = await engine_client \
.get_supported_tasks() # type: ignore
else:
supported_tasks = model_config.supported_tasks
logger.info("Supported_tasks: %s", supported_tasks)
# Create the openai serving objects. # Create the openai serving objects.
openai_serving_models = OpenAIServingModels( openai_serving_models = OpenAIServingModels(
engine_client=engine_client, engine_client=engine_client,
@ -351,7 +360,7 @@ async def run_batch(
chat_template=None, chat_template=None,
chat_template_content_format="auto", chat_template_content_format="auto",
enable_prompt_tokens_details=args.enable_prompt_tokens_details, enable_prompt_tokens_details=args.enable_prompt_tokens_details,
) if "generate" in model_config.supported_tasks else None ) if "generate" in supported_tasks else None
openai_serving_embedding = OpenAIServingEmbedding( openai_serving_embedding = OpenAIServingEmbedding(
engine_client, engine_client,
model_config, model_config,
@ -359,19 +368,17 @@ async def run_batch(
request_logger=request_logger, request_logger=request_logger,
chat_template=None, chat_template=None,
chat_template_content_format="auto", chat_template_content_format="auto",
) if "embed" in model_config.supported_tasks else None ) if "embed" in supported_tasks else None
enable_serving_reranking = ("classify" in model_config.supported_tasks enable_serving_reranking = ("classify" in supported_tasks and getattr(
and getattr(model_config.hf_config, model_config.hf_config, "num_labels", 0) == 1)
"num_labels", 0) == 1)
openai_serving_scores = ServingScores( openai_serving_scores = ServingScores(
engine_client, engine_client,
model_config, model_config,
openai_serving_models, openai_serving_models,
request_logger=request_logger, request_logger=request_logger,
) if ("embed" in model_config.supported_tasks ) if ("embed" in supported_tasks or enable_serving_reranking) else None
or enable_serving_reranking) else None
tracker = BatchProgressTracker() tracker = BatchProgressTracker()
logger.info("Reading batch from %s...", args.input_file) logger.info("Reading batch from %s...", args.input_file)

View File

@ -113,7 +113,9 @@ class OpenAIServingCompletion(OpenAIServing):
return self.create_error_response( return self.create_error_response(
"Echo is unsupported with prompt embeds.") "Echo is unsupported with prompt embeds.")
request_id = f"cmpl-{self._base_request_id(raw_request)}" request_id = (
f"cmpl-"
f"{self._base_request_id(raw_request, request.request_id)}")
created_time = int(time.time()) created_time = int(time.time())
request_metadata = RequestResponseMetadata(request_id=request_id) request_metadata = RequestResponseMetadata(request_id=request_id)

View File

@ -163,8 +163,9 @@ class OpenAIServingEmbedding(EmbeddingMixin):
for the API specification. This API mimics the OpenAI Embedding API. for the API specification. This API mimics the OpenAI Embedding API.
""" """
model_name = self._get_model_name(request.model) model_name = self._get_model_name(request.model)
request_id = (f"{self.request_id_prefix}-" request_id = (
f"{self._base_request_id(raw_request)}") f"{self.request_id_prefix}-"
f"{self._base_request_id(raw_request, request.request_id)}")
ctx = EmbeddingServeContext( ctx = EmbeddingServeContext(
request=request, request=request,

View File

@ -880,7 +880,10 @@ class OpenAIServing:
_chat_template_kwargs.update(chat_template_kwargs or {}) _chat_template_kwargs.update(chat_template_kwargs or {})
request_prompt: Union[str, list[int]] request_prompt: Union[str, list[int]]
if isinstance(tokenizer, MistralTokenizer):
if tokenizer is None:
request_prompt = "placeholder"
elif isinstance(tokenizer, MistralTokenizer):
request_prompt = apply_mistral_chat_template( request_prompt = apply_mistral_chat_template(
tokenizer, tokenizer,
messages=messages, messages=messages,
@ -910,7 +913,14 @@ class OpenAIServing:
request = tool_parser(tokenizer).adjust_request( # type: ignore request = tool_parser(tokenizer).adjust_request( # type: ignore
request=request) request=request)
if isinstance(request_prompt, str): if tokenizer is None:
assert isinstance(request_prompt, str), (
"Prompt has to be a string", \
"when the tokenizer is not initialised"
)
prompt_inputs = TextTokensPrompt(prompt=request_prompt,
prompt_token_ids=[1])
elif isinstance(request_prompt, str):
prompt_inputs = await self._tokenize_prompt_input_async( prompt_inputs = await self._tokenize_prompt_input_async(
request, request,
tokenizer, tokenizer,
@ -947,9 +957,11 @@ class OpenAIServing:
def _load_and_validate_embed(embed: bytes) -> EmbedsPrompt: def _load_and_validate_embed(embed: bytes) -> EmbedsPrompt:
tensor = torch.load(io.BytesIO(base64.b64decode(embed)), tensor = torch.load(io.BytesIO(base64.b64decode(embed)),
weights_only=True) weights_only=True)
assert isinstance( assert isinstance(tensor, torch.Tensor) and tensor.dtype in (
tensor, torch.float32,
(torch.FloatTensor, torch.BFloat16Tensor, torch.HalfTensor)) torch.bfloat16,
torch.float16,
)
if tensor.dim() > 2: if tensor.dim() > 2:
tensor = tensor.squeeze(0) tensor = tensor.squeeze(0)
assert tensor.dim() == 2 assert tensor.dim() == 2

View File

@ -96,7 +96,11 @@ class OpenAIServingPooling(OpenAIServing):
self.max_model_len, truncate_prompt_tokens) self.max_model_len, truncate_prompt_tokens)
lora_request = self._maybe_get_adapters(request) lora_request = self._maybe_get_adapters(request)
tokenizer = await self.engine_client.get_tokenizer(lora_request) if self.model_config.skip_tokenizer_init:
tokenizer = None
else:
tokenizer = await self.engine_client.get_tokenizer(lora_request
)
if isinstance(request, PoolingChatRequest): if isinstance(request, PoolingChatRequest):
( (

View File

@ -16,8 +16,8 @@ from vllm.config import VllmConfig
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.pooling_params import PoolingTask
from vllm.sequence import ExecuteModelRequest, PoolerOutput from vllm.sequence import ExecuteModelRequest, PoolerOutput
from vllm.tasks import SupportedTask
from vllm.utils import make_async from vllm.utils import make_async
from vllm.worker.worker_base import WorkerBase from vllm.worker.worker_base import WorkerBase
@ -136,9 +136,9 @@ class ExecutorBase(ABC):
return self.collective_rpc(rpc_func) return self.collective_rpc(rpc_func)
@cached_property # Avoid unnecessary RPC calls @cached_property # Avoid unnecessary RPC calls
def supported_pooling_tasks(self) -> tuple[PoolingTask, ...]: def supported_tasks(self) -> tuple[SupportedTask, ...]:
output = self.collective_rpc("get_supported_pooling_tasks") output = self.collective_rpc("get_supported_tasks")
return tuple({task for tasks in output for task in tasks}) return output[0]
def execute_model( def execute_model(
self, execute_model_req: ExecuteModelRequest self, execute_model_req: ExecuteModelRequest

View File

@ -1127,6 +1127,7 @@ def flashinfer_fused_moe_blockscale_fp8(
tile_tokens_dim=_get_tile_tokens_dim(x.shape[0], top_k, tile_tokens_dim=_get_tile_tokens_dim(x.shape[0], top_k,
global_num_experts), global_num_experts),
routing_method_type=2, # DeepSeek-styled routing method routing_method_type=2, # DeepSeek-styled routing method
use_shuffled_weight=False,
) )

View File

@ -5,144 +5,8 @@ from typing import Optional
import torch import torch
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.triton_utils import tl, triton from vllm.triton_utils import triton
from vllm.utils import cdiv, round_up from vllm.utils import round_up
@triton.jit
def moe_align_block_size_stage1(
    topk_ids_ptr,
    tokens_cnts_ptr,
    num_experts: tl.constexpr,
    numel: tl.constexpr,
    tokens_per_thread: tl.constexpr,
):
    # Stage 1: count, per program, how often each expert id occurs in the
    # program's slice of `topk_ids`.
    #
    # Program `pid` reads entries
    # [pid * tokens_per_thread, pid * tokens_per_thread + tokens_per_thread)
    # of `topk_ids_ptr` (skipping indices >= numel) and increments the counter
    # at flat offset (pid + 1) * num_experts + expert_id in `tokens_cnts_ptr`.
    pid = tl.program_id(0)
    start_idx = pid * tokens_per_thread
    # The "+ 1" means this kernel never writes the first `num_experts`
    # counters.
    # NOTE(review): later stages appear to rely on those leading counters
    # being zero on entry -- confirm against the host-side allocation.
    off_c = (pid + 1) * num_experts
    for i in range(tokens_per_thread):
        # Guard the tail: the last slices may extend past `numel`.
        if start_idx + i < numel:
            idx = tl.load(topk_ids_ptr + start_idx + i)
            # Read-modify-write of this program's own counter for expert
            # `idx`.
            token_cnt = tl.load(tokens_cnts_ptr + off_c + idx)
            tl.store(tokens_cnts_ptr + off_c + idx, token_cnt + 1)
@triton.jit
def moe_align_block_size_stage2(
    tokens_cnts_ptr,
    num_experts: tl.constexpr,
):
    # Stage 2: turn the per-program counters into a running (inclusive) sum
    # along the row direction, one column per program.
    #
    # Program `pid` walks flat offsets i * num_experts + pid for
    # i = 1 .. num_experts and overwrites each counter with the sum of all
    # counters visited so far in that column.
    pid = tl.program_id(0)
    last_cnt = 0
    for i in range(1, num_experts + 1):
        token_cnt = tl.load(tokens_cnts_ptr + i * num_experts + pid)
        last_cnt = last_cnt + token_cnt
        # After the loop, the entry at i == num_experts holds the column
        # total.
        tl.store(tokens_cnts_ptr + i * num_experts + pid, last_cnt)
@triton.jit
def moe_align_block_size_stage3(
    total_tokens_post_pad_ptr,
    tokens_cnts_ptr,
    cumsum_ptr,
    num_experts: tl.constexpr,
    block_size: tl.constexpr,
):
    # Stage 3: build block-aligned cumulative offsets from the counters that
    # start at flat offset num_experts * num_experts in `tokens_cnts_ptr`.
    #
    # For i = 1 .. num_experts the i-th of those counters is rounded up to a
    # multiple of `block_size` and accumulated; the running total is written
    # to cumsum_ptr[i]. The final total is also stored to
    # `total_tokens_post_pad_ptr`. cumsum_ptr[0] is not written here.
    # NOTE(review): the body does not use tl.program_id, so it looks intended
    # for a single-program launch -- confirm at the call site.
    last_cumsum = 0
    off_cnt = num_experts * num_experts
    for i in range(1, num_experts + 1):
        token_cnt = tl.load(tokens_cnts_ptr + off_cnt + i - 1)
        # cdiv(...) * block_size pads this count up to a whole number of
        # blocks.
        last_cumsum = last_cumsum + tl.cdiv(token_cnt, block_size) * block_size
        tl.store(cumsum_ptr + i, last_cumsum)
    tl.store(total_tokens_post_pad_ptr, last_cumsum)
@triton.jit
def moe_align_block_size_stage4(
    topk_ids_ptr,
    sorted_token_ids_ptr,
    expert_ids_ptr,
    tokens_cnts_ptr,
    cumsum_ptr,
    num_experts: tl.constexpr,
    block_size: tl.constexpr,
    numel: tl.constexpr,
    tokens_per_thread: tl.constexpr,
):
    # Stage 4: fill `expert_ids_ptr` and scatter flat token indices into
    # `sorted_token_ids_ptr`.
    #
    # The program id is used in two roles:
    #   1. as an expert id, to label every block in
    #      [cumsum[pid], cumsum[pid + 1]) with `pid`;
    #   2. as a slice index into `topk_ids`, to place each entry of the slice
    #      at its destination row.
    pid = tl.program_id(0)
    start_idx = tl.load(cumsum_ptr + pid)
    end_idx = tl.load(cumsum_ptr + pid + 1)
    # Role 1: one expert-id entry per `block_size` rows of this expert's
    # range.
    for i in range(start_idx, end_idx, block_size):
        tl.store(expert_ids_ptr + i // block_size, pid)

    # Role 2: `start_idx` is deliberately re-bound to the start of this
    # program's slice of `topk_ids`.
    start_idx = pid * tokens_per_thread
    # Counters at offset pid * num_experts are read as the number of entries
    # per expert already claimed before this slice.
    # NOTE(review): presumably these are the running sums produced by the
    # earlier stages for the preceding slices -- confirm.
    off_t = pid * num_experts
    for i in range(start_idx, tl.minimum(start_idx + tokens_per_thread,
                                         numel)):
        expert_id = tl.load(topk_ids_ptr + i)
        token_cnt = tl.load(tokens_cnts_ptr + off_t + expert_id)
        # Destination row = expert's aligned base offset + rank within the
        # expert.
        rank_post_pad = token_cnt + tl.load(cumsum_ptr + expert_id)
        tl.store(sorted_token_ids_ptr + rank_post_pad, i)
        # Bump the counter so the next entry for this expert gets the next
        # row.
        tl.store(tokens_cnts_ptr + off_t + expert_id, token_cnt + 1)
# Triton implementation based on:
# https://github.com/sgl-project/sglang/commit/ba5112ff691d791a9e38c6c71f59324a5fcb49d0
def moe_align_block_size_triton(
    topk_ids: torch.Tensor,
    num_experts: int,
    block_size: int,
    sorted_token_ids: torch.Tensor,
    expert_ids: torch.Tensor,
    num_tokens_post_pad: torch.Tensor,
) -> None:
    """Run the four Triton stages that align routed tokens to block_size.

    The three output tensors are written in place: ``sorted_token_ids`` is
    first filled with ``topk_ids.numel()`` and ``expert_ids`` is zeroed,
    then the kernels overwrite the entries they compute.
    ``num_tokens_post_pad`` receives the padded total from stage 3.

    Args:
        topk_ids: expert id chosen for every (token, slot) entry.
        num_experts: number of experts; also the launch width of stages
            1, 2 and 4.
        block_size: alignment applied to every expert's token count.
        sorted_token_ids: output buffer of destination-ordered indices.
        expert_ids: output buffer holding one expert id per block.
        num_tokens_post_pad: output buffer for the padded token total.
    """
    device = topk_ids.device
    total_ids = topk_ids.numel()
    per_expert_grid = (num_experts, )

    # Scratch buffers shared by the stages; both must start zeroed.
    counters = torch.zeros((num_experts + 1, num_experts),
                           dtype=torch.int32,
                           device=device)
    aligned_offsets = torch.zeros((num_experts + 1, ),
                                  dtype=torch.int32,
                                  device=device)
    ids_per_program = cdiv(total_ids, num_experts)

    # Reset the outputs before any kernel writes into them.
    sorted_token_ids.fill_(total_ids)
    expert_ids.zero_()

    # Stage 1: per-program histogram of expert ids.
    moe_align_block_size_stage1[per_expert_grid](
        topk_ids,
        counters,
        num_experts,
        total_ids,
        ids_per_program,
    )
    # Stage 2: running sum of the histogram, one column per program.
    moe_align_block_size_stage2[per_expert_grid](counters, num_experts)
    # Stage 3: block-aligned cumulative offsets (single program).
    moe_align_block_size_stage3[(1, )](
        num_tokens_post_pad,
        counters,
        aligned_offsets,
        num_experts,
        block_size,
    )
    # Stage 4: label blocks with their expert and scatter the indices.
    moe_align_block_size_stage4[per_expert_grid](
        topk_ids,
        sorted_token_ids,
        expert_ids,
        counters,
        aligned_offsets,
        num_experts,
        block_size,
        total_ids,
        ids_per_program,
    )
def moe_align_block_size( def moe_align_block_size(

View File

@ -76,43 +76,43 @@ def _moe_unpermute_and_reduce(
def moe_permute( def moe_permute(
hidden_states: torch.Tensor, hidden_states: torch.Tensor,
topk_weights: torch.Tensor, a1q_scale: Optional[torch.Tensor],
topk_ids: torch.Tensor, topk_ids: torch.Tensor,
token_expert_indices: torch.Tensor,
topk: int,
n_expert: int, n_expert: int,
n_local_expert: int, n_local_expert: int = -1,
expert_map: Optional[torch.Tensor] = None, expert_map: Optional[torch.Tensor] = None,
align_block_size: Optional[int] = None, align_block_size: Optional[int] = None,
fill_invalid_expert: int = -1 fill_invalid_expert: int = -1
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: ) -> tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor, torch.Tensor,
torch.Tensor]:
""" """
This function expands and permutes activation to gather uncontinuous tokens This function expands and permutes activation to gather uncontinuous tokens
for each expert. for each expert.
Parameters: Parameters:
- hidden_states (torch.Tensor): The input tensor to the MoE layer. - hidden_states (torch.Tensor): The input tensor to the MoE layer.
- topk_weights (torch.Tensor): topk expert route weight for each token. - a1q_scale (Optional[torch.Tensor]): quant scale for hidden_states
- topk_ids (torch.Tensor): topk expert route id for each token. - topk_ids (torch.Tensor): topk expert route id for each token.
- token_expert_indices (torch.Tensor): indice for expanded hidden.
- topk (int): The number of top-k experts to select.
- n_expert (int): The number of expert. - n_expert (int): The number of expert.
- n_local_expert (int): The number of expert in current EP rank. - n_local_expert (int): The number of expert in current EP rank.
- expert_map (Optional[torch.Tensor]): A tensor mapping expert indices - expert_map (Optional[torch.Tensor]): A tensor mapping expert indices
from the global expert space to the local expert space of the expert from the global expert space to the local expert space of the expert
parallel shard. parallel shard.
- align_block_size (Optional[int]): align group gemm block size for deepgemm - align_block_size (Optional[int]): align group gemm block size for deepgemm
- fill_invalid_expert(int): fill expert id in m_indices for invalid expert - fill_invalid_expert(int): fill expert id in m_indices for invalid expert
to workaround DeepGemm unsupported -1 in m_indices to workaround DeepGemm unsupported -1 in m_indices
Returns: Returns:
- permuted_hidden_states (torch.Tensor): permuted activation. - permuted_hidden_states (torch.Tensor): permuted activation.
- a1q_scale (Optional[torch.Tensor]): quant scale for hidden_states
- expert_first_token_offset (torch.Tensor): offset of the first token - expert_first_token_offset (torch.Tensor): offset of the first token
of each expert for standard grouped gemm. if enable 'align_block_size' of each expert for standard grouped gemm. if enable 'align_block_size'
expert_first_token_offset will align up to 'align_block_size'. expert_first_token_offset will align up to 'align_block_size'.
- src_row_id2dst_row_id_map (torch.Tensor): idx map for moe_unpermute. - inv_permuted_idx (torch.Tensor): idx map for moe_unpermute.
- permuted_idx (torch.Tensor): idx map from hidden to permuted_hidden.
- m_indices: m_indices for grouped gemm in deepgemm,`m_indices[i]` records - m_indices: m_indices for grouped gemm in deepgemm,`m_indices[i]` records
the group which the j-th row of the LHS belong to.` the group which the j-th row of the LHS belong to.`
""" """
n_token, n_hidden = hidden_states.size() n_token, n_hidden = hidden_states.size()
topk = topk_ids.size(1)
assert (n_hidden * hidden_states.element_size() assert (n_hidden * hidden_states.element_size()
) % 16 == 0, "permue kernel need hidden dim align to 16B" ) % 16 == 0, "permue kernel need hidden dim align to 16B"
permuted_row_size = n_token * topk permuted_row_size = n_token * topk
@ -120,12 +120,19 @@ def moe_permute(
permuted_row_size = (permuted_row_size + n_expert * permuted_row_size = (permuted_row_size + n_expert *
(align_block_size - 1) + align_block_size - (align_block_size - 1) + align_block_size -
1) // align_block_size * align_block_size 1) // align_block_size * align_block_size
if n_local_expert == -1:
n_local_expert = n_expert
permuted_hidden_states = torch.empty( permuted_hidden_states = torch.empty(
(permuted_row_size, n_hidden), (permuted_row_size, n_hidden),
dtype=hidden_states.dtype, dtype=hidden_states.dtype,
device=hidden_states.device, device=hidden_states.device,
) )
token_expert_indices = torch.arange(0,
n_token * topk,
dtype=torch.int32,
device=hidden_states.device).reshape(
(n_token, topk))
m_indices = torch.full((permuted_row_size, ), m_indices = torch.full((permuted_row_size, ),
fill_invalid_expert, fill_invalid_expert,
dtype=torch.int32, dtype=torch.int32,
@ -133,57 +140,54 @@ def moe_permute(
expert_first_token_offset = torch.empty(n_local_expert + 1, expert_first_token_offset = torch.empty(n_local_expert + 1,
dtype=torch.int64, dtype=torch.int64,
device=hidden_states.device) device=hidden_states.device)
src_row_id2dst_row_id_map = torch.empty((n_token, topk), permuted_idx = torch.full((permuted_row_size, ),
dtype=torch.int32, n_token * topk,
device=hidden_states.device) dtype=torch.int32,
torch.ops._moe_C.moe_permute(hidden_states, topk_weights, topk_ids, device=hidden_states.device)
token_expert_indices, expert_map, n_expert, inv_permuted_idx = torch.empty((n_token, topk),
n_local_expert, topk, align_block_size, dtype=torch.int32,
permuted_hidden_states, device=hidden_states.device)
expert_first_token_offset, topk_ids = topk_ids.to(torch.int32)
src_row_id2dst_row_id_map, m_indices) torch.ops._moe_C.moe_permute(hidden_states, topk_ids, token_expert_indices,
return (permuted_hidden_states, expert_first_token_offset, expert_map, n_expert, n_local_expert, topk,
src_row_id2dst_row_id_map, m_indices) align_block_size, permuted_hidden_states,
expert_first_token_offset, inv_permuted_idx,
permuted_idx, m_indices)
if a1q_scale is not None:
a1q_scale = a1q_scale[permuted_idx.clamp(max=n_token * topk - 1) //
topk]
return (permuted_hidden_states, a1q_scale, expert_first_token_offset,
inv_permuted_idx.flatten(), m_indices)
def moe_unpermute( def moe_unpermute(
out: torch.Tensor,
permuted_hidden_states: torch.Tensor, permuted_hidden_states: torch.Tensor,
topk_weights: torch.Tensor, topk_weights: torch.Tensor,
topk_ids: torch.Tensor, inv_permuted_idx: torch.Tensor,
src_row_id2dst_row_id_map: torch.Tensor, expert_first_token_offset: Optional[torch.Tensor] = None,
expert_first_token_offset: torch.Tensor, ) -> None:
topk: int,
n_expert: int,
n_local_expert: int,
) -> torch.Tensor:
""" """
This function expands and permutes activation to gathering uncontinuous This function expands and permutes activation to gathering uncontinuous
tokens for each expert. tokens for each expert.
Parameters: Parameters:
- out (torch.Tensor): output tensor
- permuted_hidden_states (torch.Tensor): permuted activation. - permuted_hidden_states (torch.Tensor): permuted activation.
- topk_weights (torch.Tensor): topk expert route weight for each token. - topk_weights (torch.Tensor): topk expert route weight for each token.
- topk_ids (torch.Tensor): topk expert route id for each token. - inv_permuted_idx (torch.Tensor): row idx map for moe_unpermute.
- expert_first_token_offset (torch.Tensor): offset of the first token - expert_first_token_offset (Optional[torch.Tensor]): offset of the first
of each expert for grouped gemm. token of each expert for grouped gemm.
- topk (int): The number of top-k experts to select.
- n_expert (int): The number of expert.
- n_local_expert (int): The number of expert in current EP rank.
Returns: Returns:
- hidden_states (torch.Tensor): The reduced and unpermuted activation - hidden_states (torch.Tensor): The reduced and unpermuted activation
tensor. tensor.
""" """
n_token, n_hidden = topk_weights.size(0), permuted_hidden_states.size(-1) topk = topk_weights.size(1)
n_hidden = permuted_hidden_states.size(-1)
assert (n_hidden * permuted_hidden_states.element_size() assert (n_hidden * permuted_hidden_states.element_size()
) % 16 == 0, "unpermue kernel need hidden dim align to 16B" ) % 16 == 0, "unpermue kernel need hidden dim align to 16B"
hidden_states = torch.empty((n_token, n_hidden),
dtype=permuted_hidden_states.dtype,
device=permuted_hidden_states.device)
torch.ops._moe_C.moe_unpermute(permuted_hidden_states, topk_weights, torch.ops._moe_C.moe_unpermute(permuted_hidden_states, topk_weights,
topk_ids, src_row_id2dst_row_id_map, inv_permuted_idx, expert_first_token_offset,
expert_first_token_offset, n_expert, topk, out)
n_local_expert, topk, hidden_states)
return hidden_states
def moe_permute_unpermute_supported(): def moe_permute_unpermute_supported():

View File

@ -24,6 +24,7 @@ from vllm.model_executor.layers.mamba.mamba_utils import (
extra_groups_for_head_shards, get_mamba_state_shape) extra_groups_for_head_shards, get_mamba_state_shape)
from vllm.model_executor.layers.mamba.ops.causal_conv1d import ( from vllm.model_executor.layers.mamba.ops.causal_conv1d import (
causal_conv1d_fn, causal_conv1d_update) causal_conv1d_fn, causal_conv1d_update)
from vllm.model_executor.layers.mamba.ops.layernorm_gated import rms_norm_gated
from vllm.model_executor.layers.mamba.ops.mamba_ssm import ( from vllm.model_executor.layers.mamba.ops.mamba_ssm import (
selective_state_update) selective_state_update)
from vllm.model_executor.layers.mamba.ops.ssd_combined import ( from vllm.model_executor.layers.mamba.ops.ssd_combined import (
@ -133,21 +134,15 @@ class Mixer2RMSNormGated(CustomOp):
return x * nn.functional.silu(gate.to( return x * nn.functional.silu(gate.to(
torch.float32)).to(input_dtype) torch.float32)).to(input_dtype)
if self.tp_size > 1 or self.n_groups != 1: if (((self.n_groups % self.tp_size) != 0) or self.n_groups != 1):
return self.forward_native(x, gate) return self.forward_native(x, gate)
from vllm import _custom_ops as ops return rms_norm_gated(x,
self.weight.data,
# cast x and gate to float32 before silu bias=None,
out = torch.empty_like(x) z=gate,
y = x * nn.functional.silu(gate.to(torch.float32)) eps=self.variance_epsilon,
ops.rms_norm( norm_before_gate=False)
out,
y.to(x.dtype),
self.weight.data,
self.variance_epsilon,
)
return out
def mamba_v2_sharded_weight_loader( def mamba_v2_sharded_weight_loader(

View File

@ -0,0 +1,168 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Copyright (c) 2024, Tri Dao.
# Adapted from https://github.com/state-spaces/mamba/blob/60dadf2e0ee730ac337035d5533de10bc26e4847/mamba_ssm/ops/triton/layernorm_gated.py
import torch
from vllm.triton_utils import tl, triton
# HAS_BIAS / HAS_Z are derived from whether the B / Z arguments are None, so
# callers never pass them explicitly.
@triton.heuristics({"HAS_BIAS": lambda args: args["B"] is not None})
@triton.heuristics({"HAS_Z": lambda args: args["Z"] is not None})
@triton.jit
def _layer_norm_fwd_1pass_kernel(
    X,  # pointer to the input
    Y,  # pointer to the output
    W,  # pointer to the weights
    B,  # pointer to the biases
    Z,  # pointer to the other branch
    Mean,  # pointer to the mean
    Rstd,  # pointer to the 1/std
    stride_x_row: tl.int64,
    stride_y_row: tl.int64,
    stride_z_row: tl.int64,
    M: tl.int64,  # number of rows in X
    N: tl.int64,  # number of columns in X
    eps,  # epsilon to avoid division by zero
    BLOCK_N: tl.constexpr,
    HAS_BIAS: tl.constexpr,
    HAS_Z: tl.constexpr,
    NORM_BEFORE_GATE: tl.constexpr,
    IS_RMS_NORM: tl.constexpr,
):
    # Single-pass (gated) LayerNorm / RMSNorm forward for one (row, group)
    # pair. Program axis 0 selects the row, axis 1 selects the group; each
    # program normalizes N contiguous elements.
    # Map the program id to the row of X and Y it should compute.
    row = tl.program_id(0)
    group = tl.program_id(1)
    # Advance every pointer to this program's segment. Groups are laid out
    # back to back inside a row (offset group * N).
    X += row * stride_x_row + group * N
    Y += row * stride_y_row + group * N
    if HAS_Z:
        Z += row * stride_z_row + group * N
    # Mean is only touched in the LayerNorm path; the statistics buffers are
    # indexed as [group * M + row].
    if not IS_RMS_NORM:
        Mean += group * M
    Rstd += group * M
    W += group * N
    if HAS_BIAS:
        B += group * N
    # Compute mean and variance
    cols = tl.arange(0, BLOCK_N)
    x = tl.load(X + cols, mask=cols < N, other=0.).to(tl.float32)
    if HAS_Z and not NORM_BEFORE_GATE:
        # Gate first: x <- x * silu(z), then normalize the gated value.
        # NOTE(review): this masked load has no `other=`, so lanes with
        # cols >= N hold unspecified z values. The variance below re-masks
        # with tl.where, but the LayerNorm mean sums `x` unmasked -- confirm
        # padded lanes cannot contribute non-finite values.
        z = tl.load(Z + cols, mask=cols < N).to(tl.float32)
        x *= z * tl.sigmoid(z)
    if not IS_RMS_NORM:
        mean = tl.sum(x, axis=0) / N
        tl.store(Mean + row, mean)
        xbar = tl.where(cols < N, x - mean, 0.)
        var = tl.sum(xbar * xbar, axis=0) / N
    else:
        # RMSNorm: no mean subtraction, "variance" is the mean square.
        xbar = tl.where(cols < N, x, 0.)
        var = tl.sum(xbar * xbar, axis=0) / N
    rstd = 1 / tl.sqrt(var + eps)
    tl.store(Rstd + row, rstd)
    # Normalize and apply linear transformation
    mask = cols < N
    w = tl.load(W + cols, mask=mask).to(tl.float32)
    if HAS_BIAS:
        b = tl.load(B + cols, mask=mask).to(tl.float32)
    # `mean` is only defined (and only referenced) when IS_RMS_NORM is False.
    x_hat = (x - mean) * rstd if not IS_RMS_NORM else x * rstd
    y = x_hat * w + b if HAS_BIAS else x_hat * w
    if HAS_Z and NORM_BEFORE_GATE:
        # Normalize first, then gate: y <- y * silu(z).
        z = tl.load(Z + cols, mask=mask).to(tl.float32)
        y *= z * tl.sigmoid(z)
    # Write output
    tl.store(Y + cols, y, mask=mask)
def _layer_norm_fwd(x,
                    weight,
                    bias,
                    eps,
                    z=None,
                    out=None,
                    group_size=None,
                    norm_before_gate=True,
                    is_rms_norm=False):
    """Validate inputs and launch the fused one-pass norm kernel.

    ``x`` must be 2D with a unit stride in its last dimension. The columns
    are split into ``N // group_size`` groups that are normalized
    independently; ``group_size=None`` means one group spanning all columns.

    Returns:
        ``(out, mean, rstd)``. ``mean`` is ``None`` when ``is_rms_norm`` is
        set; ``mean`` / ``rstd`` are float32 buffers of length
        ``ngroups * M``.

    Raises:
        RuntimeError: if a group does not fit in a single kernel block
            (more than 64KB of elements per group).
    """
    n_rows, n_cols = x.shape
    if group_size is None:
        group_size = n_cols
    assert n_cols % group_size == 0
    n_groups = n_cols // group_size

    # Layout checks: the kernel indexes the last dimension contiguously.
    assert x.stride(-1) == 1
    if z is not None:
        assert z.stride(-1) == 1
        assert z.shape == (n_rows, n_cols)
    assert weight.shape == (n_cols, )
    assert weight.stride(-1) == 1
    if bias is not None:
        assert bias.stride(-1) == 1
        assert bias.shape == (n_cols, )

    # allocate output
    if out is None:
        out = torch.empty_like(x)
    else:
        assert out.shape == x.shape
    assert out.stride(-1) == 1

    stats_shape = (n_groups * n_rows, )
    mean = None
    if not is_rms_norm:
        mean = torch.empty(stats_shape, dtype=torch.float32, device=x.device)
    rstd = torch.empty(stats_shape, dtype=torch.float32, device=x.device)

    # Less than 64KB per feature: enqueue fused kernel
    max_fused_elems = 65536 // x.element_size()
    block_n = min(max_fused_elems, triton.next_power_of_2(group_size))
    if group_size > block_n:
        raise RuntimeError(
            "This layer norm doesn't support feature dim >= 64KB.")

    # heuristics for number of warps
    warps = min(max(block_n // 256, 1), 8)
    z_row_stride = 0 if z is None else z.stride(0)
    launch_grid = (n_rows, n_groups)
    with torch.cuda.device(x.device.index):
        _layer_norm_fwd_1pass_kernel[launch_grid](
            x,
            out,
            weight,
            bias,
            z,
            mean,
            rstd,
            x.stride(0),
            out.stride(0),
            z_row_stride,
            n_rows,
            group_size,
            eps,
            BLOCK_N=block_n,
            NORM_BEFORE_GATE=norm_before_gate,
            IS_RMS_NORM=is_rms_norm,
            num_warps=warps)
    return out, mean, rstd
def rms_norm_gated(x,
                   weight,
                   bias,
                   z=None,
                   eps=1e-6,
                   group_size=None,
                   norm_before_gate=True):
    """Apply (optionally gated) RMSNorm over the last dimension of ``x``.

    ``x`` (and ``z`` when given, which must have the same shape) is
    flattened to 2D, normalized by ``_layer_norm_fwd`` with
    ``is_rms_norm=True``, and the result is reshaped back to the input
    shape.
    """

    def _to_rows(t):
        # Collapse leading dims; copy only if the last dim is strided.
        t = t.reshape(-1, t.shape[-1])
        return t if t.stride(-1) == 1 else t.contiguous()

    input_shape = x.shape
    # reshape input data into 2D tensor
    x = _to_rows(x)
    if z is not None:
        assert z.shape == input_shape
        z = _to_rows(z)
    weight = weight.contiguous()
    if bias is not None:
        bias = bias.contiguous()
    normed = _layer_norm_fwd(x,
                             weight,
                             bias,
                             eps,
                             z=z,
                             group_size=group_size,
                             norm_before_gate=norm_before_gate,
                             is_rms_norm=True)[0]
    return normed.reshape(input_shape)

View File

@ -16,8 +16,9 @@ from vllm.config import ModelConfig, PoolerConfig
from vllm.model_executor.pooling_metadata import ( # noqa: E501 from vllm.model_executor.pooling_metadata import ( # noqa: E501
PoolingMetadata as V0PoolingMetadata) PoolingMetadata as V0PoolingMetadata)
from vllm.model_executor.pooling_metadata import PoolingTensors from vllm.model_executor.pooling_metadata import PoolingTensors
from vllm.pooling_params import PoolingParams, PoolingTask from vllm.pooling_params import PoolingParams
from vllm.sequence import PoolerOutput, PoolingSequenceGroupOutput from vllm.sequence import PoolerOutput, PoolingSequenceGroupOutput
from vllm.tasks import PoolingTask
from vllm.utils import resolve_obj_by_qualname from vllm.utils import resolve_obj_by_qualname
from vllm.v1.pool.metadata import PoolingMetadata as V1PoolingMetadata from vllm.v1.pool.metadata import PoolingMetadata as V1PoolingMetadata

View File

@ -33,7 +33,7 @@ from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
find_matched_target, is_activation_quantization_format, find_matched_target, is_activation_quantization_format,
should_ignore_layer) should_ignore_layer)
from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
from vllm.model_executor.layers.quantization.utils.nvfp4_emulation_utils import ( # noqa: E501 from vllm.model_executor.layers.quantization.utils.quant_utils import (
cutlass_fp4_supported) cutlass_fp4_supported)
from vllm.platforms import current_platform from vllm.platforms import current_platform

View File

@ -27,8 +27,8 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import (
prepare_moe_fp4_layer_for_marlin) prepare_moe_fp4_layer_for_marlin)
from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import ( from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
prepare_moe_fp8_layer_for_marlin) prepare_moe_fp8_layer_for_marlin)
from vllm.model_executor.layers.quantization.utils.nvfp4_emulation_utils import ( # noqa: E501 from vllm.model_executor.layers.quantization.utils.quant_utils import (
cutlass_fp4_supported) cutlass_fp4_supported, swizzle_blockscale)
from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
all_close_1d, normalize_e4m3fn_to_e4m3fnuz, per_tensor_dequantize) all_close_1d, normalize_e4m3fn_to_e4m3fnuz, per_tensor_dequantize)
from vllm.model_executor.utils import set_weight_attrs from vllm.model_executor.utils import set_weight_attrs
@ -193,29 +193,6 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
{"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}) {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value})
set_weight_attrs(w2_input_scale, extra_weight_attrs) set_weight_attrs(w2_input_scale, extra_weight_attrs)
def swizzle_blockscale(self, scale: torch.tensor):
assert (scale.dtype == torch.float8_e4m3fn)
# Pad and blockwise interleave weight_scale
scale_ndim = scale.ndim
if scale.ndim == 2:
scale = scale.unsqueeze(0)
assert scale.ndim == 3
B, M, K = scale.shape
round_up_multiple = lambda x, m: (x + m - 1) // m * m
M_padded = round_up_multiple(M, 128)
K_padded = round_up_multiple(K, 4)
padded_scale = torch.zeros((B, M_padded, K_padded), dtype=scale.dtype)
padded_scale[:B, :M, :K] = scale
batches, rows, cols = padded_scale.shape
assert rows % 128 == 0
assert cols % 4 == 0
padded_scale = padded_scale.reshape(batches, rows // 128, 4, 32,
cols // 4, 4)
swizzled_scale = padded_scale.permute((0, 1, 4, 3, 2, 5))
swizzled_scale = swizzled_scale.contiguous().cuda()
return (swizzled_scale.reshape(M, K)
if scale_ndim == 2 else swizzled_scale.reshape(B, M, K))
def process_weights_after_loading(self, layer: torch.nn.Module) -> None: def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
# From packed to weight # From packed to weight
@ -243,13 +220,13 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
return return
# swizzle weight scales # swizzle weight scales
layer.w13_blockscale_swizzled = torch.nn.Parameter( layer.w13_blockscale_swizzled = torch.nn.Parameter(swizzle_blockscale(
self.swizzle_blockscale(layer.w13_weight_scale), layer.w13_weight_scale),
requires_grad=False) requires_grad=False)
layer.w2_blockscale_swizzled = torch.nn.Parameter( layer.w2_blockscale_swizzled = torch.nn.Parameter(swizzle_blockscale(
self.swizzle_blockscale(layer.w2_weight_scale), layer.w2_weight_scale),
requires_grad=False) requires_grad=False)
# w13 # w13
w13_input_global_scale = layer.w13_input_global_scale.max( w13_input_global_scale = layer.w13_input_global_scale.max(

View File

@ -9,8 +9,7 @@ from torch.nn.parameter import Parameter
import vllm.envs as envs import vllm.envs as envs
import vllm.model_executor.layers.fused_moe.modular_kernel as mk import vllm.model_executor.layers.fused_moe.modular_kernel as mk
from vllm._custom_ops import (cutlass_scaled_fp4_mm, from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
cutlass_scaled_mm_supports_fp4, scaled_fp4_quant)
from vllm.distributed import get_ep_group from vllm.distributed import get_ep_group
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.model_executor.layers.fused_moe.config import FusedMoEParallelConfig from vllm.model_executor.layers.fused_moe.config import FusedMoEParallelConfig
@ -28,7 +27,7 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import (
apply_fp4_marlin_linear, is_fp4_marlin_supported, apply_fp4_marlin_linear, is_fp4_marlin_supported,
prepare_fp4_layer_for_marlin, prepare_moe_fp4_layer_for_marlin) prepare_fp4_layer_for_marlin, prepare_moe_fp4_layer_for_marlin)
from vllm.model_executor.layers.quantization.utils.quant_utils import ( from vllm.model_executor.layers.quantization.utils.quant_utils import (
GroupShape, is_layer_skipped) GroupShape, cutlass_fp4_supported, is_layer_skipped, swizzle_blockscale)
from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
Fp8LinearOp, requantize_with_max_scale) Fp8LinearOp, requantize_with_max_scale)
from vllm.model_executor.parameter import (ModelWeightParameter, from vllm.model_executor.parameter import (ModelWeightParameter,
@ -667,14 +666,6 @@ class ModelOptNvFp4Config(QuantizationConfig):
return None return None
def cutlass_fp4_supported() -> bool:
    """Report whether the CUTLASS FP4 scaled-mm kernel can be used.

    Returns ``False`` on any non-CUDA platform. On CUDA the device
    capability is converted to an int (``-1`` when it is unavailable) and
    passed to ``cutlass_scaled_mm_supports_fp4``.
    """
    if current_platform.is_cuda():
        device_capability = current_platform.get_device_capability()
        if device_capability is None:
            capability_code = -1
        else:
            capability_code = device_capability.to_int()
        return cutlass_scaled_mm_supports_fp4(capability_code)
    return False
class ModelOptFp8KVCacheMethod(BaseKVCacheMethod): class ModelOptFp8KVCacheMethod(BaseKVCacheMethod):
""" """
Supports loading kv-cache scaling factors from FP8 checkpoints. Supports loading kv-cache scaling factors from FP8 checkpoints.
@ -772,29 +763,6 @@ class ModelOptNvFp4LinearMethod(LinearMethodBase):
layer.register_parameter("weight_scale", weight_scale) layer.register_parameter("weight_scale", weight_scale)
def swizzle_blockscale(self, scale: torch.tensor):
assert (scale.dtype == torch.float8_e4m3fn)
# Pad and blockwise interleave weight_scale
scale_ndim = scale.ndim
if scale.ndim == 2:
scale = scale.unsqueeze(0)
assert scale.ndim == 3
B, M, K = scale.shape
round_up_multiple = lambda x, m: (x + m - 1) // m * m
M_padded = round_up_multiple(M, 128)
K_padded = round_up_multiple(K, 4)
padded_scale = torch.zeros((B, M_padded, K_padded), dtype=scale.dtype)
padded_scale[:B, :M, :K] = scale
batches, rows, cols = padded_scale.shape
assert rows % 128 == 0
assert cols % 4 == 0
padded_scale = padded_scale.reshape(batches, rows // 128, 4, 32,
cols // 4, 4)
swizzled_scale = padded_scale.permute((0, 1, 4, 3, 2, 5))
swizzled_scale = swizzled_scale.contiguous().cuda()
return (swizzled_scale.reshape(M, K)
if scale_ndim == 2 else swizzled_scale.reshape(B, M, K))
def process_weights_after_loading(self, layer: Module) -> None: def process_weights_after_loading(self, layer: Module) -> None:
# global scales: # global scales:
@ -814,7 +782,7 @@ class ModelOptNvFp4LinearMethod(LinearMethodBase):
"Expected weight_scale.dim(1) to be divisible by 16") "Expected weight_scale.dim(1) to be divisible by 16")
assert (layer.weight_scale.dtype == torch.float8_e4m3fn), ( assert (layer.weight_scale.dtype == torch.float8_e4m3fn), (
"Weight Block scale must be represented as FP8-E4M3") "Weight Block scale must be represented as FP8-E4M3")
swizzled_weight_scale = self.swizzle_blockscale(layer.weight_scale) swizzled_weight_scale = swizzle_blockscale(layer.weight_scale)
layer.weight_scale_swizzled = Parameter(swizzled_weight_scale, layer.weight_scale_swizzled = Parameter(swizzled_weight_scale,
requires_grad=False) requires_grad=False)
@ -1060,29 +1028,6 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
weight_loader=weight_loader) weight_loader=weight_loader)
layer.register_parameter("w2_input_scale", w2_input_scale) layer.register_parameter("w2_input_scale", w2_input_scale)
def swizzle_blockscale(self, scale: torch.tensor):
assert (scale.dtype == torch.float8_e4m3fn)
# Pad and blockwise interleave weight_scale
scale_ndim = scale.ndim
if scale.ndim == 2:
scale = scale.unsqueeze(0)
assert scale.ndim == 3
B, M, K = scale.shape
round_up_multiple = lambda x, m: (x + m - 1) // m * m
M_padded = round_up_multiple(M, 128)
K_padded = round_up_multiple(K, 4)
padded_scale = torch.zeros((B, M_padded, K_padded), dtype=scale.dtype)
padded_scale[:B, :M, :K] = scale
batches, rows, cols = padded_scale.shape
assert rows % 128 == 0
assert cols % 4 == 0
padded_scale = padded_scale.reshape(batches, rows // 128, 4, 32,
cols // 4, 4)
swizzled_scale = padded_scale.permute((0, 1, 4, 3, 2, 5))
swizzled_scale = swizzled_scale.contiguous().cuda()
return (swizzled_scale.reshape(M, K)
if scale_ndim == 2 else swizzled_scale.reshape(B, M, K))
def process_weights_after_loading(self, layer: torch.nn.Module) -> None: def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
# GEMM 1 # GEMM 1
# The FlashInfer Cutlass fused MoE kernel expects the combined weights # The FlashInfer Cutlass fused MoE kernel expects the combined weights
@ -1128,8 +1073,7 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
"Expected weight_scale.dim(1) to be divisible by 16") "Expected weight_scale.dim(1) to be divisible by 16")
assert (layer.w13_weight_scale.dtype == torch.float8_e4m3fn), ( assert (layer.w13_weight_scale.dtype == torch.float8_e4m3fn), (
"Weight Blockscale must be represented as FP8-E4M3") "Weight Blockscale must be represented as FP8-E4M3")
w13_blockscale_swizzled = self.swizzle_blockscale( w13_blockscale_swizzled = swizzle_blockscale(layer.w13_weight_scale)
layer.w13_weight_scale)
layer.w13_blockscale_swizzled = Parameter(w13_blockscale_swizzled, layer.w13_blockscale_swizzled = Parameter(w13_blockscale_swizzled,
requires_grad=False) requires_grad=False)
@ -1151,7 +1095,7 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
"Expected weight_scale.dim(1) to be divisible by 16") "Expected weight_scale.dim(1) to be divisible by 16")
assert (layer.w2_weight_scale.dtype == torch.float8_e4m3fn), ( assert (layer.w2_weight_scale.dtype == torch.float8_e4m3fn), (
"Weight Blockscale must be represented as FP8-E4M3") "Weight Blockscale must be represented as FP8-E4M3")
w2_blockscale_swizzled = self.swizzle_blockscale(layer.w2_weight_scale) w2_blockscale_swizzled = swizzle_blockscale(layer.w2_weight_scale)
layer.w2_blockscale_swizzled = Parameter(w2_blockscale_swizzled, layer.w2_blockscale_swizzled = Parameter(w2_blockscale_swizzled,
requires_grad=False) requires_grad=False)

View File

@ -3,18 +3,19 @@
# Copyright © 2025, Oracle and/or its affiliates. # Copyright © 2025, Oracle and/or its affiliates.
import os import os
from typing import Any, Optional from typing import Any, Callable, Optional
import torch import torch
import torch.nn.functional as F import torch.nn.functional as F
from torch.nn.parameter import Parameter from torch.nn.parameter import Parameter
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.model_executor.layers.fused_moe import FusedMoE, FusedMoEMethodBase
from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
set_weight_attrs) set_weight_attrs)
from vllm.model_executor.layers.quantization import QuantizationMethods from vllm.model_executor.layers.quantization import QuantizationMethods
from vllm.model_executor.layers.quantization.base_config import ( from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig) QuantizationConfig, QuantizeMethodBase)
logger = init_logger(__name__) logger = init_logger(__name__)
"""By default, use 8 bit as target precision, but it can be """By default, use 8 bit as target precision, but it can be
@ -71,9 +72,11 @@ class RTNConfig(QuantizationConfig):
return cls(weight_bits, group_size) return cls(weight_bits, group_size)
def get_quant_method(self, layer: torch.nn.Module, def get_quant_method(self, layer: torch.nn.Module,
prefix: str) -> Optional["RTNLinearMethod"]: prefix: str) -> Optional["QuantizeMethodBase"]:
if isinstance(layer, LinearBase): if isinstance(layer, LinearBase):
return RTNLinearMethod(self) return RTNLinearMethod(self)
elif isinstance(layer, FusedMoE):
return RTNMoEMethod(self)
return None return None
@ -94,11 +97,18 @@ class RTNTensor:
self.data.narrow(dim, start // factor, length // factor), self.data.narrow(dim, start // factor, length // factor),
self.scale.narrow(dim, start, length), self.quant_config) self.scale.narrow(dim, start, length), self.quant_config)
def __getitem__(self, key):
    """Index the packed data and its scale with the same ``key``.

    Returns a new ``RTNTensor`` wrapping both indexed tensors and sharing
    this tensor's ``quant_config``.
    """
    indexed_data = self.data[key]
    indexed_scale = self.scale[key]
    return RTNTensor(indexed_data, indexed_scale, self.quant_config)
@property @property
def shape(self): def shape(self):
shape = self.data.shape shape = self.data.shape
factor = 1 if self.quant_config.weight_bits == 8 else 2 factor = 1 if self.quant_config.weight_bits == 8 else 2
return torch.Size((shape[0] * factor, shape[1])) batch_present = len(shape) == 3
if batch_present:
return torch.Size((shape[0], shape[1] * factor, shape[2]))
else:
return torch.Size((shape[0] * factor, shape[1]))
def copy_(self, loaded_weight: torch.Tensor) -> None: def copy_(self, loaded_weight: torch.Tensor) -> None:
qweight, weight_scale = rtn_quantize(loaded_weight.cuda(), qweight, weight_scale = rtn_quantize(loaded_weight.cuda(),
@ -165,7 +175,7 @@ class RTNLinearMethod(LinearMethodBase):
weight = RTNParameter(data=torch.empty(output_size_per_partition // weight = RTNParameter(data=torch.empty(output_size_per_partition //
factor, factor,
input_size_per_partition, input_size_per_partition,
dtype=torch.int8), dtype=torch.uint8),
scale=scale, scale=scale,
quant_config=self.quant_config) quant_config=self.quant_config)
@ -180,18 +190,7 @@ class RTNLinearMethod(LinearMethodBase):
layer.output_size_per_partition = output_size_per_partition layer.output_size_per_partition = output_size_per_partition
def process_weights_after_loading(self, layer: torch.nn.Module) -> None: def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
"""torch.compile does not know how to deal with a Parameter subclass fix_weights(layer, "weight")
(aka RTNParameter). As we don't really need RTNParameters for the
forward pass, we replace them with equivalent instances of Parameters.
"""
old_weight = layer.weight
assert isinstance(old_weight, RTNParameter)
data = old_weight.data.data
delattr(layer, "weight")
new_weight = Parameter(data=data, requires_grad=False)
layer.register_parameter("weight", new_weight)
def apply(self, def apply(self,
layer: torch.nn.Module, layer: torch.nn.Module,
@ -209,6 +208,128 @@ class RTNLinearMethod(LinearMethodBase):
return out return out
class RTNMoEMethod(FusedMoEMethodBase):
    """Fused-MoE quantization method backed by RTN-quantized expert weights.

    Expert weights are registered as ``RTNParameter`` objects holding uint8
    data plus a per-group scale; after loading they are replaced by plain
    ``Parameter`` objects and the forward pass runs through ``fused_experts``.
    """

    def __init__(self, quant_config: RTNConfig):
        self.quant_config = quant_config

    def create_weights(self, layer: torch.nn.Module, num_experts: int,
                       hidden_size: int, intermediate_size_per_partition: int,
                       params_dtype: torch.dtype, **extra_weight_attrs):
        """Register the ``w13``/``w2`` weights and their scales on ``layer``.

        The weight tensors are allocated as uint8 with their second dimension
        divided by the packing factor (1 for 8-bit weights, 2 otherwise).
        ``extra_weight_attrs`` are attached to both weight parameters.
        """
        cfg = self.quant_config
        pack_factor = 1 if cfg.weight_bits == 8 else 2
        gate_up_rows = 2 * intermediate_size_per_partition

        # Fused gate_up_proj (column parallel)
        if cfg.group_size != -1:
            gate_up_groups = hidden_size // cfg.group_size
        else:
            gate_up_groups = 1
        gate_up_scale = Parameter(
            torch.empty(num_experts,
                        gate_up_rows,
                        gate_up_groups,
                        dtype=params_dtype),
            requires_grad=False,
        )
        layer.register_parameter("w13_scale", gate_up_scale)

        gate_up_data = torch.empty(num_experts,
                                   gate_up_rows // pack_factor,
                                   hidden_size,
                                   dtype=torch.uint8)
        gate_up_weight = RTNParameter(data=gate_up_data,
                                      scale=gate_up_scale,
                                      quant_config=cfg)
        layer.register_parameter("w13_weight", gate_up_weight)
        set_weight_attrs(gate_up_weight, extra_weight_attrs)

        # down_proj (row parallel)
        if cfg.group_size != -1:
            down_groups = intermediate_size_per_partition // cfg.group_size
        else:
            down_groups = 1
        down_scale = Parameter(torch.zeros(num_experts,
                                           hidden_size,
                                           down_groups,
                                           dtype=params_dtype),
                               requires_grad=False)
        layer.register_parameter("w2_scale", down_scale)

        down_data = torch.empty(num_experts,
                                hidden_size // pack_factor,
                                intermediate_size_per_partition,
                                dtype=torch.uint8)
        down_weight = RTNParameter(data=down_data,
                                   scale=down_scale,
                                   quant_config=cfg)
        layer.register_parameter("w2_weight", down_weight)
        set_weight_attrs(down_weight, extra_weight_attrs)

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        """Swap both RTN weight parameters for plain ``Parameter`` objects.

        The reshape option of ``fix_weights`` is enabled only for 4-bit
        weights.
        """
        needs_reshape = self.quant_config.weight_bits == 4
        for param_name in ("w13_weight", "w2_weight"):
            fix_weights(layer, param_name, needs_reshape)

    def apply(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
        router_logits: torch.Tensor,
        top_k: int,
        renormalize: bool,
        use_grouped_topk: bool = False,
        topk_group: Optional[int] = None,
        num_expert_group: Optional[int] = None,
        global_num_experts: int = -1,
        expert_map: Optional[torch.Tensor] = None,
        custom_routing_function: Optional[Callable] = None,
        scoring_func: str = "softmax",
        e_score_correction_bias: Optional[torch.Tensor] = None,
        apply_router_weight_on_input: bool = False,
        activation: str = "silu",
        enable_eplb: bool = False,
        expert_load_view: Optional[torch.Tensor] = None,
        logical_to_physical_map: Optional[torch.Tensor] = None,
        logical_replica_count: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Route ``x`` through the selected experts and return the result.

        Expert selection is delegated to ``FusedMoE.select_experts`` and the
        expert computation to ``fused_experts``, which is called with
        ``inplace=True``.

        Raises:
            NotImplementedError: If ``enable_eplb`` is set.
        """
        if enable_eplb:
            raise NotImplementedError(
                "EPLB not supported for `RTNMoEMethod` yet.")

        from vllm.model_executor.layers.fused_moe import fused_experts

        routing = FusedMoE.select_experts(
            hidden_states=x,
            router_logits=router_logits,
            use_grouped_topk=use_grouped_topk,
            top_k=top_k,
            renormalize=renormalize,
            topk_group=topk_group,
            num_expert_group=num_expert_group,
            custom_routing_function=custom_routing_function,
            scoring_func=scoring_func,
            e_score_correction_bias=e_score_correction_bias)
        topk_weights, topk_ids = routing

        bits = self.quant_config.weight_bits
        group = self.quant_config.group_size
        is_int4 = bits == 4
        is_int8 = bits == 8

        return fused_experts(
            x,
            layer.w13_weight,
            layer.w2_weight,
            topk_weights=topk_weights,
            topk_ids=topk_ids,
            inplace=True,
            activation=activation,
            use_int4_w4a16=is_int4,
            use_int8_w8a16=is_int8,
            global_num_experts=global_num_experts,
            w1_scale=layer.w13_scale,
            w2_scale=layer.w2_scale,
            apply_router_weight_on_input=apply_router_weight_on_input,
            expert_map=expert_map,
            block_shape=[0, group])
def rtn_quantize(tensor: torch.Tensor, num_bits: int, def rtn_quantize(tensor: torch.Tensor, num_bits: int,
group_size: int) -> tuple[torch.Tensor, torch.Tensor]: group_size: int) -> tuple[torch.Tensor, torch.Tensor]:
"""Quantize a tensor using per-group static scaling factor. """Quantize a tensor using per-group static scaling factor.
@ -221,34 +342,44 @@ def rtn_quantize(tensor: torch.Tensor, num_bits: int,
If equal to -1, each row in the input tensor is treated If equal to -1, each row in the input tensor is treated
as one group. as one group.
""" """
batch_present = len(tensor.shape) == 3
if not batch_present:
tensor = tensor.unsqueeze(0)
q_range = 2**num_bits q_range = 2**num_bits
num_groups = (tensor.shape[0] * tensor.shape[1] // num_groups = (tensor.shape[1] * tensor.shape[2] //
group_size if group_size != -1 else tensor.shape[0]) group_size if group_size != -1 else tensor.shape[1])
"""Calculate a scaling factor per input group. """Calculate a scaling factor per input group.
""" """
input_flat = tensor.reshape(num_groups, -1) input_flat = tensor.reshape(tensor.shape[0], num_groups, -1)
input_min = torch.min(input_flat, dim=1, keepdim=True)[0] input_min = torch.min(input_flat, dim=2, keepdim=True)[0]
input_max = torch.max(input_flat, dim=1, keepdim=True)[0] input_max = torch.max(input_flat, dim=2, keepdim=True)[0]
input_max_abs = torch.max(input_min.abs(), input_max.abs()) input_max_abs = torch.max(input_min.abs(), input_max.abs())
scale = (input_max_abs * 2.0 / (q_range - 1)) scale = (input_max_abs * 2.0 / (q_range - 1))
"""Scale each input group, truncate and round to the nearest integer. """Scale each input group, round to the nearest integer, shift
the range and truncate.
""" """
scaled_input = input_flat / scale scaled_input = input_flat / scale
scaled_input = scaled_input.clamp(-q_range // 2, q_range // 2 - 1)
scaled_input = scaled_input.round() scaled_input = scaled_input.round()
scaled_input += q_range // 2
scaled_input = scaled_input.clamp(0, q_range - 1)
scale = scale.reshape(tensor.shape[0], -1).contiguous() scale = scale.reshape(tensor.shape[0], tensor.shape[1], -1).contiguous()
inputs_q = scaled_input.reshape(tensor.shape).to(torch.int8) inputs_q = scaled_input.reshape(tensor.shape).to(torch.uint8)
inputs_q = inputs_q.contiguous() inputs_q = inputs_q.contiguous()
if num_bits == 4: if num_bits == 4:
"""Pack two 4-bit values into each byte. """Pack two 4-bit values into each byte.
""" """
inputs_q = (inputs_q[:, 1::2] << 4) | (inputs_q[:, ::2] & 0xf) inputs_q = (inputs_q[:, :, 1::2] << 4) | (inputs_q[:, :, ::2] & 0xf)
inputs_q = inputs_q.reshape(tensor.shape[0] // 2, tensor.shape[1]) inputs_q = inputs_q.reshape(tensor.shape[0], tensor.shape[1] // 2,
tensor.shape[2])
inputs_q = inputs_q.contiguous() inputs_q = inputs_q.contiguous()
if not batch_present:
inputs_q = inputs_q.squeeze(0)
scale = scale.squeeze(0)
return inputs_q, scale return inputs_q, scale
@ -259,31 +390,60 @@ def rtn_dequantize(tensor: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
tensor: The input tensor. tensor: The input tensor.
scale: The tensor with per-group scale factors. scale: The tensor with per-group scale factors.
""" """
batch_present = len(tensor.shape) == 3
if not batch_present:
tensor = tensor.unsqueeze(0)
scale = scale.unsqueeze(0)
num_groups = scale.size(0) * scale.size(1) num_groups = scale.size(1) * scale.size(2)
input_dim, output_dim = tensor.shape batch, input_dim, output_dim = tensor.shape
num_bits = 8 if input_dim == scale.size(0) else 4 num_bits = 8 if input_dim == scale.size(1) else 4
q_range = 2**num_bits
if num_bits == 4: if num_bits == 4:
input_dim *= 2 input_dim *= 2
data = torch.empty((input_dim, output_dim), data = torch.empty((batch, input_dim, output_dim),
dtype=scale.dtype, dtype=scale.dtype,
device=tensor.device) device=tensor.device)
if num_bits == 8: if num_bits == 8:
data.copy_(tensor) data.copy_(tensor)
data -= q_range // 2
else: else:
"""Unpack two 4-bit values from each byte. """Unpack two 4-bit values from each byte.
""" """
tensor = tensor.reshape(input_dim, output_dim // 2) tensor = tensor.reshape(batch, input_dim, output_dim // 2)
for i in range(2): for i in range(2):
data[:, i::2] = (tensor << 4 * (1 - i)) >> 4 data[:, :, i::2] = ((tensor << 4 *
(1 - i)) >> 4).to(torch.int8) - q_range // 2
"""Scale each input group with its scaling factor. """Scale each input group with its scaling factor.
""" """
scale = scale.reshape(num_groups, -1) scale = scale.reshape(batch, num_groups, -1)
data = data.reshape(num_groups, -1) data = data.reshape(batch, num_groups, -1)
data = torch.mul(data, scale) data = torch.mul(data, scale)
input_deq = data.reshape((input_dim, output_dim)).contiguous() input_deq = data.reshape((batch, input_dim, output_dim)).contiguous()
if not batch_present:
input_deq = input_deq.squeeze(0)
return input_deq return input_deq
def fix_weights(layer: torch.nn.Module,
                param_name: str,
                reshape: bool = False):
    """Replace an ``RTNParameter`` on ``layer`` with a plain ``Parameter``.

    torch.compile does not know how to deal with a Parameter subclass
    (aka RTNParameter). As RTNParameters are not needed for the forward
    pass, the attribute is re-registered as an ordinary Parameter that
    wraps the same underlying data.

    Args:
        layer: Module that owns the parameter.
        param_name: Attribute name of the parameter to replace.
        reshape: When true, the raw data is reshaped to
            ``(shape[0], shape[1] * 2, -1)`` of the old parameter's
            reported shape before being re-registered.
    """
    rtn_param = getattr(layer, param_name)
    assert isinstance(rtn_param, RTNParameter)
    raw = rtn_param.data.data
    delattr(layer, param_name)

    if reshape:
        dim0 = rtn_param.shape[0]
        dim1 = rtn_param.shape[1] * 2
        raw = raw.reshape(dim0, dim1, -1)

    layer.register_parameter(param_name,
                             Parameter(data=raw, requires_grad=False))

View File

@ -238,13 +238,20 @@ def per_token_group_quant_int8(
int8_min = iinfo.min int8_min = iinfo.min
x_q = torch.empty_like(x, device=x.device, dtype=dtype) x_q = torch.empty_like(x, device=x.device, dtype=dtype)
M = x.numel() // group_size
N = group_size
x_s = torch.empty( x_s = torch.empty(
x.shape[:-1] + (x.shape[-1] // group_size, ), x.shape[:-1] + (x.shape[-1] // group_size, ),
device=x.device, device=x.device,
dtype=torch.float32, dtype=torch.float32,
) )
# prefer CUDA kernel if available
if current_platform.is_cuda():
torch.ops._C.per_token_group_quant_int8(x, x_q, x_s, group_size, eps,
float(int8_min),
float(int8_max))
return x_q, x_s
M = x.numel() // group_size
N = group_size
BLOCK = triton.next_power_of_2(N) BLOCK = triton.next_power_of_2(N)
# heuristics for number of warps # heuristics for number of warps

View File

@ -2,13 +2,12 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch import torch
from vllm._custom_ops import cutlass_scaled_mm_supports_fp4
from vllm.platforms import current_platform
from vllm.scalar_type import scalar_types from vllm.scalar_type import scalar_types
__all__ = [ __all__ = [
"break_fp4_bytes", "dequantize_to_dtype", "ref_nvfp4_quant", "break_fp4_bytes",
"cutlass_fp4_supported" "dequantize_to_dtype",
"ref_nvfp4_quant",
] ]
FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max() FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max()
@ -17,14 +16,6 @@ kE2M1ToFloat = torch.tensor([0., 0.5, 1., 1.5, 2., 3., 4., 6.],
dtype=torch.float32) dtype=torch.float32)
def cutlass_fp4_supported() -> bool:
    """Report whether the CUTLASS FP4 scaled-mm op supports this device.

    Returns ``False`` on any non-CUDA platform. Otherwise the device
    capability is converted to an integer (``-1`` when the platform does
    not report one) and passed to ``cutlass_scaled_mm_supports_fp4``.
    """
    if current_platform.is_cuda():
        device_capability = current_platform.get_device_capability()
        if device_capability is None:
            capability_code = -1
        else:
            capability_code = device_capability.to_int()
        return cutlass_scaled_mm_supports_fp4(capability_code)
    return False
def break_fp4_bytes(a, dtype): def break_fp4_bytes(a, dtype):
assert a.dtype == torch.uint8 assert a.dtype == torch.uint8
m, n = a.shape m, n = a.shape

View File

@ -8,8 +8,10 @@ from typing import ClassVar, NamedTuple, Optional
import numpy import numpy
import torch import torch
from vllm._custom_ops import cutlass_scaled_mm_supports_fp4
from vllm.model_executor.layers.quantization.qqq import ( from vllm.model_executor.layers.quantization.qqq import (
MARLIN_QQQ_SUPPORTED_NUM_BITS) MARLIN_QQQ_SUPPORTED_NUM_BITS)
from vllm.platforms import current_platform
from vllm.scalar_type import ScalarType, scalar_types from vllm.scalar_type import ScalarType, scalar_types
@ -592,3 +594,56 @@ def awq_pack(
q_w = q_w.reshape((-1, size_n)).contiguous() q_w = q_w.reshape((-1, size_n)).contiguous()
return pack_cols(q_w, num_bits, size_k, size_n) return pack_cols(q_w, num_bits, size_k, size_n)
def swizzle_blockscale(scale: torch.Tensor) -> torch.Tensor:
"""
Pad and block-interleave the FP4 block-scales so that they match the data
layout expected by the CUTLASS / FlashInfer kernels.
Parameters
----------
scale: torch.Tensor
Returns
-------
torch.Tensor
The swizzled tensor with the same logical shape as *scale*.
"""
assert scale.dtype == torch.float8_e4m3fn, (
"swizzle_blockscale expects the input tensor to be in "
"torch.float8_e4m3fn format.")
scale_ndim = scale.ndim
if scale_ndim == 2:
scale = scale.unsqueeze(0) # (1, M, K)
assert scale.ndim == 3, "Expected a 2-D or 3-D tensor for block scales."
B, M, K = scale.shape
def _round_up(x: int, m: int) -> int:
return (x + m - 1) // m * m
M_padded = _round_up(M, 128)
K_padded = _round_up(K, 4)
padded = torch.zeros((B, M_padded, K_padded),
dtype=scale.dtype,
device=scale.device)
padded[:B, :M, :K] = scale
# Reshape / permute to the layout required by the kernel.
padded = padded.reshape(B, M_padded // 128, 4, 32, K_padded // 4, 4)
swizzled = padded.permute(0, 1, 4, 3, 2, 5).contiguous().cuda()
if scale_ndim == 2:
return swizzled.reshape(M, K)
return swizzled.reshape(B, M, K)
def cutlass_fp4_supported() -> bool:
    """Report whether the CUTLASS FP4 scaled-mm op supports this device.

    Returns ``False`` on any non-CUDA platform. Otherwise the device
    capability is converted to an integer (``-1`` when the platform does
    not report one) and passed to ``cutlass_scaled_mm_supports_fp4``.
    """
    if current_platform.is_cuda():
        device_capability = current_platform.get_device_capability()
        if device_capability is None:
            capability_code = -1
        else:
            capability_code = device_capability.to_int()
        return cutlass_scaled_mm_supports_fp4(capability_code)
    return False

View File

@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Iterable, Mapping, Sequence from collections.abc import Iterable, Mapping, Sequence
from typing import Optional, TypedDict, Union from typing import Annotated, Optional, Union
import torch import torch
import torch.nn as nn import torch.nn as nn
@ -29,6 +29,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor,
PromptUpdate) PromptUpdate)
from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.utils.tensor_schema import TensorSchema, TensorShape
# yapf: disable # yapf: disable
from .idefics2_vision_model import Idefics2VisionConfig from .idefics2_vision_model import Idefics2VisionConfig
@ -42,15 +43,26 @@ from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
merge_multimodal_embeddings) merge_multimodal_embeddings)
class AriaImagePixelInputs(TypedDict): class AriaImagePixelInputs(TensorSchema):
pixel_values: torch.Tensor
pixel_mask: Optional[torch.Tensor]
""" """
Shape: Dimensions:
pixel_values: `(batch_size * num_images, num_channels, height, width)` - b: Batch size
pixel_mask: `(batch_size * num_images, height, width)` - n: Number of images
- c: Number of channels
- h: Height of each image
- w: Width of each image
""" """
pixel_values: Annotated[
torch.Tensor,
TensorShape("bn", 3, "h", "w"),
]
pixel_mask: Annotated[
Optional[torch.Tensor],
TensorShape("bn", "h", "w"),
]
class AriaVisionTransformer(Idefics3VisionTransformer, SupportsQuant): class AriaVisionTransformer(Idefics3VisionTransformer, SupportsQuant):
packed_modules_mapping = {"qkv_proj": ["q_proj", "k_proj", "v_proj"]} packed_modules_mapping = {"qkv_proj": ["q_proj", "k_proj", "v_proj"]}
@ -540,12 +552,6 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal):
self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
self.vocab_size, logit_scale) self.vocab_size, logit_scale)
def _validate_image_sizes(
self, images: list[torch.Tensor]) -> list[torch.Tensor]:
if not all(img.shape == images[0].shape for img in images):
raise ValueError("All images must be the same size")
return images
def _parse_and_validate_image_input( def _parse_and_validate_image_input(
self, **kwargs: object) -> Optional[AriaImagePixelInputs]: self, **kwargs: object) -> Optional[AriaImagePixelInputs]:
pixel_values = kwargs.pop("pixel_values", None) pixel_values = kwargs.pop("pixel_values", None)
@ -554,23 +560,9 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal):
if pixel_values is None: if pixel_values is None:
return None return None
if not isinstance(pixel_values, (torch.Tensor, list)):
raise ValueError("Incorrect type of pixel values. "
f"Got type: {type(pixel_values)}")
pixel_values = self._validate_image_sizes(pixel_values)
pixel_values = flatten_bn(pixel_values, concat=True)
if pixel_mask is not None:
if not isinstance(pixel_mask, (torch.Tensor, list)):
raise ValueError("Incorrect type of pixel mask. "
f"Got type: {type(pixel_mask)}")
pixel_mask = flatten_bn(pixel_mask, concat=True)
return AriaImagePixelInputs( return AriaImagePixelInputs(
pixel_values=pixel_values, pixel_values=flatten_bn(pixel_values, concat=True),
pixel_mask=pixel_mask, pixel_mask=flatten_bn(pixel_mask, concat=True),
) )
def _create_patch_attention_mask( def _create_patch_attention_mask(

Some files were not shown because too many files have changed in this diff Show More