Merge remote-tracking branch 'nm/lwilkinson/fix-flashmla-full-cudagraph' into wide_ep_working_branch

This commit is contained in:
Tyler Michael Smith 2025-07-27 21:22:09 +00:00
commit f1c9ef3afd
148 changed files with 7438 additions and 1598 deletions

View File

@ -74,7 +74,7 @@ Here is an example of one test inside `latency-tests.json`:
In this example:
- The `test_name` attribute is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`.
- The `parameters` attribute controls the command-line arguments used for `benchmark_latency.py`. Use an underscore `_` instead of a dash `-` when specifying the arguments; `run-performance-benchmarks.sh` converts the underscores back to dashes before feeding the arguments to `benchmark_latency.py`. For example, the corresponding command-line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
- The `parameters` attribute controls the command-line arguments used for `vllm bench latency`. Use an underscore `_` instead of a dash `-` when specifying the arguments; `run-performance-benchmarks.sh` converts the underscores back to dashes before feeding the arguments to `vllm bench latency`. For example, the corresponding command-line arguments for `vllm bench latency` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
Note that the performance numbers are highly sensitive to the parameter values. Please make sure the parameters are set correctly.
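To make the underscore-to-dash convention concrete, here is a minimal sketch of a `latency-tests.json` entry and the command it would be translated into. The test name and values are illustrative assumptions, not copied from the repository; only the attribute names follow the description above.

```bash
# Hypothetical latency-tests.json entry -- note the underscores in the parameter keys.
cat > latency-tests.json <<'EOF'
[
  {
    "test_name": "latency_llama8B_tp1",
    "parameters": {
      "model": "meta-llama/Meta-Llama-3-8B",
      "tensor_parallel_size": 1,
      "load_format": "dummy",
      "num_iters_warmup": 5,
      "num_iters": 15
    }
  }
]
EOF

# run-performance-benchmarks.sh would turn the entry above into roughly:
vllm bench latency \
  --model meta-llama/Meta-Llama-3-8B \
  --tensor-parallel-size 1 \
  --load-format dummy \
  --num-iters-warmup 5 \
  --num-iters 15
```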
@ -82,13 +82,13 @@ WARNING: The benchmarking script will save json results by itself, so please do
### Throughput test
The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except that the parameters are fed forward to `benchmark_throughput.py`.
The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except that the parameters are fed forward to `vllm bench throughput`.
The number reported by this test is also stable -- even a slight change in this number usually indicates a real change in performance.
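As with the latency sketch above, each entry's `parameters` map one-to-one onto flags of the throughput benchmark. A hypothetical entry with `model`, `load_format`, and `num_prompts` keys (assumed names, for illustration only) would therefore translate into something like:

```bash
# Hypothetical command derived from a throughput-tests.json entry (values are illustrative)
vllm bench throughput \
  --model meta-llama/Meta-Llama-3-8B \
  --load-format dummy \
  --num-prompts 200
```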
### Serving test
We test the throughput by using `benchmark_serving.py` with the request rate set to `inf`, so that the online serving overhead is covered. The corresponding parameters are in `serving-tests.json`, and here is an example:
We test the throughput by using `vllm bench serve` with the request rate set to `inf`, so that the online serving overhead is covered. The corresponding parameters are in `serving-tests.json`, and here is an example:
```json
[
@ -118,8 +118,8 @@ Inside this example:
- The `test_name` attribute is also a unique identifier for the test. It must start with `serving_`.
- The `server-parameters` attribute contains the command-line arguments for the vLLM server.
- The `client-parameters` attribute contains the command-line arguments for `benchmark_serving.py`.
- The `qps_list` attribute controls the list of QPS values to test; each value is used to set the `--request-rate` parameter of `benchmark_serving.py`.
- The `client-parameters` attribute contains the command-line arguments for `vllm bench serve`.
- The `qps_list` attribute controls the list of QPS values to test; each value is used to set the `--request-rate` parameter of `vllm bench serve`.
The number reported by this test is less stable than the latency and throughput benchmarks (due to randomized ShareGPT dataset sampling inside `benchmark_serving.py`), but a large change in this number (e.g. a 5% change) still indicates a significant change in performance.
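To tie the bullets together, here is a hedged sketch of what one `serving-tests.json` entry might look like and how a `qps_list` value ends up on the client command line. The key spellings follow the bullet names above and all values are made up; check the actual JSON files in the repository for the exact convention.

```bash
# Hypothetical serving-tests.json entry (illustrative values only).
cat > serving-tests.json <<'EOF'
[
  {
    "test_name": "serving_llama8B_tp1_sharegpt",
    "qps_list": [1, 4, 16, "inf"],
    "server-parameters": {
      "model": "meta-llama/Meta-Llama-3-8B",
      "tensor_parallel_size": 1
    },
    "client-parameters": {
      "model": "meta-llama/Meta-Llama-3-8B",
      "dataset_name": "sharegpt",
      "num_prompts": 200
    }
  }
]
EOF

# For each value in qps_list, the harness runs the client once, e.g. for qps = 4:
vllm bench serve \
  --model meta-llama/Meta-Llama-3-8B \
  --dataset-name sharegpt \
  --num-prompts 200 \
  --request-rate 4
```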

View File

@ -100,7 +100,7 @@ if __name__ == "__main__":
raw_result = json.loads(f.read())
if "serving" in str(test_file):
# this result is generated via `benchmark_serving.py`
# this result is generated via `vllm bench serve` command
# attach the benchmarking command to raw_result
try:
@ -120,7 +120,7 @@ if __name__ == "__main__":
continue
elif "latency" in f.name:
# this result is generated via `benchmark_latency.py`
# this result is generated via `vllm bench latency` command
# attach the benchmarking command to raw_result
try:
@ -148,7 +148,7 @@ if __name__ == "__main__":
continue
elif "throughput" in f.name:
# this result is generated via `benchmark_throughput.py`
# this result is generated via `vllm bench throughput` command
# attach the benchmarking command to raw_result
try:

View File

@ -73,7 +73,7 @@ get_current_llm_serving_engine() {
echo "Container: vllm"
# move to a completely irrelevant directory, to avoid importing vllm from the current folder
export CURRENT_LLM_SERVING_ENGINE=vllm
return
fi
}
@ -95,12 +95,14 @@ json2args() {
}
kill_gpu_processes() {
pkill -f python
pkill -f python3
pkill -f tritonserver
pkill -f pt_main_thread
pkill -f text-generation
pkill -f lmdeploy
pkill -f '[p]ython'
pkill -f '[p]ython3'
pkill -f '[t]ritonserver'
pkill -f '[p]t_main_thread'
pkill -f '[t]ext-generation'
pkill -f '[l]mdeploy'
# vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
pkill -f '[V]LLM'
while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
sleep 1
@ -125,7 +127,7 @@ ensure_installed() {
}
run_serving_tests() {
# run serving tests using `benchmark_serving.py`
# run serving tests using `vllm bench serve` command
# $1: a json file specifying serving test cases
local serving_test_file
@ -225,7 +227,7 @@ run_serving_tests() {
if [[ "$dataset_name" = "sharegpt" ]]; then
client_command="python3 benchmark_serving.py \
client_command="vllm bench serve \
--backend $backend \
--tokenizer /tokenizer_cache \
--model $model \
@ -246,7 +248,7 @@ run_serving_tests() {
sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len')
sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len')
client_command="python3 benchmark_serving.py \
client_command="vllm bench serve \
--backend $backend \
--tokenizer /tokenizer_cache \
--model $model \
@ -265,13 +267,13 @@ run_serving_tests() {
$client_args"
else
echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name."
exit 1
fi
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
@ -302,7 +304,7 @@ run_serving_tests() {
}
run_genai_perf_tests() {
# run genai-perf tests
# run genai-perf tests
# $1: a json file specifying genai-perf test cases
local genai_perf_test_file
@ -311,14 +313,14 @@ run_genai_perf_tests() {
# Iterate over genai-perf tests
jq -c '.[]' "$genai_perf_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
test_name=$(echo "$params" | jq -r '.test_name')
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
# prepend the current serving engine to the test name
test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
@ -369,10 +371,10 @@ run_genai_perf_tests() {
qps=$num_prompts
echo "now qps is $qps"
fi
new_test_name=$test_name"_qps_"$qps
backend=$CURRENT_LLM_SERVING_ENGINE
if [[ "$backend" == *"vllm"* ]]; then
backend="vllm"
fi
@ -413,7 +415,7 @@ prepare_dataset() {
do
cat sonnet.txt >> sonnet_4x.txt
done
}
main() {

View File

@ -126,7 +126,8 @@ kill_gpu_processes() {
ps -aux
lsof -t -i:8000 | xargs -r kill -9
pgrep python3 | xargs -r kill -9
# vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
pgrep VLLM | xargs -r kill -9
# wait until GPU memory usage smaller than 1GB
if command -v nvidia-smi; then
@ -164,7 +165,7 @@ upload_to_buildkite() {
}
run_latency_tests() {
# run latency tests using `benchmark_latency.py`
# run latency tests using `vllm bench latency` command
# $1: a json file specifying latency test cases
local latency_test_file
@ -205,7 +206,7 @@ run_latency_tests() {
fi
fi
latency_command=" $latency_envs python3 benchmark_latency.py \
latency_command=" $latency_envs vllm bench latency \
--output-json $RESULTS_FOLDER/${test_name}.json \
$latency_args"
@ -231,7 +232,7 @@ run_latency_tests() {
}
run_throughput_tests() {
# run throughput tests using `benchmark_throughput.py`
# run throughput tests using `vllm bench throughput`
# $1: a json file specifying throughput test cases
local throughput_test_file
@ -272,7 +273,7 @@ run_throughput_tests() {
fi
fi
throughput_command=" $throughput_envs python3 benchmark_throughput.py \
throughput_command=" $throughput_envs vllm bench throughput \
--output-json $RESULTS_FOLDER/${test_name}.json \
$throughput_args"
@ -297,7 +298,7 @@ run_throughput_tests() {
}
run_serving_tests() {
# run serving tests using `benchmark_serving.py`
# run serving tests using `vllm bench serve` command
# $1: a json file specifying serving test cases
local serving_test_file
@ -393,7 +394,7 @@ run_serving_tests() {
# pass the tensor parallel size to the client so that it can be displayed
# on the benchmark dashboard
client_command="python3 benchmark_serving.py \
client_command="vllm bench serve \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
@ -447,7 +448,7 @@ main() {
(which jq) || (apt-get update && apt-get -y install jq)
(which lsof) || (apt-get update && apt-get install -y lsof)
# get the current IP address, required by benchmark_serving.py
# get the current IP address, required by `vllm bench serve` command
export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
# turn off the reporting of the status of each request, to clean up the terminal output
export VLLM_LOGGING_LEVEL="WARNING"

View File

@ -13,9 +13,9 @@ NUMA_NODE=${NUMA_NODE:-1}
export CMAKE_BUILD_PARALLEL_LEVEL=32
# Setup cleanup
remove_docker_container() {
set -e;
docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true;
remove_docker_container() {
set -e;
docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true;
}
trap remove_docker_container EXIT
remove_docker_container
@ -69,7 +69,7 @@ function cpu_tests() {
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -s -v \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"
# Note: disable it until it supports V1
# Run AWQ test
@ -83,7 +83,7 @@ function cpu_tests() {
set -e
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
python3 benchmarks/benchmark_serving.py \
vllm bench serve \
--backend vllm \
--dataset-name random \
--model meta-llama/Llama-3.2-3B-Instruct \

View File

@ -0,0 +1,166 @@
#!/bin/bash
set -xu
remove_docker_container() {
docker rm -f tpu-test || true;
docker rm -f vllm-tpu || true;
}
trap remove_docker_container EXIT
# Remove the container that might not be cleaned up in the previous run.
remove_docker_container
# Build the docker image.
docker build -f docker/Dockerfile.tpu -t vllm-tpu .
# Set up cleanup.
cleanup_docker() {
# Get Docker's root directory
docker_root=$(docker info -f '{{.DockerRootDir}}')
if [ -z "$docker_root" ]; then
echo "Failed to determine Docker root directory."
exit 1
fi
echo "Docker root directory: $docker_root"
# Check disk usage of the filesystem where Docker's root directory is located
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
# Define the threshold
threshold=70
if [ "$disk_usage" -gt "$threshold" ]; then
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
# Remove dangling images (those that are not tagged and not used by any container)
docker image prune -f
# Remove unused volumes / force the system prune for old images as well.
docker volume prune -f && docker system prune --force --filter "until=72h" --all
echo "Docker images and volumes cleanup completed."
else
echo "Disk usage is below $threshold%. No cleanup needed."
fi
}
cleanup_docker
# For HF_TOKEN.
source /etc/environment
docker run --privileged --net host --shm-size=16G -it \
-e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
vllm-tpu /bin/bash -c '
set -e # Exit immediately if a command exits with a non-zero status.
set -u # Treat unset variables as an error.
echo "--- Starting script inside Docker container ---"
# Create results directory
RESULTS_DIR=$(mktemp -d)
# If mktemp fails, set -e will cause the script to exit.
echo "Results will be stored in: $RESULTS_DIR"
# Install dependencies
echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
&& python3 -m pip install --progress-bar off lm_eval[api]==0.4.4 \
&& python3 -m pip install --progress-bar off hf-transfer
echo "--- Python dependencies installed ---"
export VLLM_USE_V1=1
export VLLM_XLA_CHECK_RECOMPILATION=1
export VLLM_XLA_CACHE_PATH=
echo "Using VLLM V1"
echo "--- Hardware Information ---"
# tpu-info
echo "--- Starting Tests ---"
set +e
overall_script_exit_code=0
# --- Test Definitions ---
# If a test fails, this function will print logs and will not cause the main script to exit.
run_test() {
local test_num=$1
local test_name=$2
local test_command=$3
local log_file="$RESULTS_DIR/test_${test_num}.log"
local actual_exit_code
echo "--- TEST_$test_num: Running $test_name ---"
# Execute the test command.
eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2)
actual_exit_code=$?
echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" # This goes to main log
echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file" # Also to per-test log
if [ "$actual_exit_code" -ne 0 ]; then
echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2
echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2
if [ -f "$log_file" ]; then
cat "$log_file" >&2
else
echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2
fi
echo "--- End of log for TEST_$test_num ($test_name) ---" >&2
return "$actual_exit_code" # Return the failure code
else
echo "TEST_$test_num ($test_name) PASSED."
return 0 # Return success
fi
}
# Helper function to call run_test and update the overall script exit code
run_and_track_test() {
local test_num_arg="$1"
local test_name_arg="$2"
local test_command_arg="$3"
# Run the test
run_test "$test_num_arg" "$test_name_arg" "$test_command_arg"
local test_specific_exit_code=$?
# If the test failed, set the overall script exit code to 1
if [ "$test_specific_exit_code" -ne 0 ]; then
# No need for extra echo here, run_test already logged the failure.
overall_script_exit_code=1
fi
}
# --- Actual Test Execution ---
run_and_track_test 1 "test_struct_output_generate.py" \
"HF_HUB_DISABLE_XET=1 python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
run_and_track_test 2 "test_moe_pallas.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
run_and_track_test 3 "test_lora.py" \
"VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
run_and_track_test 4 "test_tpu_qkv_linear.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
run_and_track_test 5 "test_spmd_model_weight_loading.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
run_and_track_test 6 "test_kv_cache_update_kernel.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py"
# After all tests have been attempted, exit with the overall status.
if [ "$overall_script_exit_code" -ne 0 ]; then
echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---"
else
echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---"
fi
exit "$overall_script_exit_code"
' # IMPORTANT: This is the closing single quote for the bash -c "..." command. Ensure it is present and correct.
# Capture the exit code of the docker run command
DOCKER_RUN_EXIT_CODE=$?
# The trap will run for cleanup.
# Exit the main script with the Docker run command's exit code.
if [ "$DOCKER_RUN_EXIT_CODE" -ne 0 ]; then
echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE."
exit "$DOCKER_RUN_EXIT_CODE"
else
echo "Docker run command completed successfully."
exit 0
fi
# TODO: This test fails because it uses RANDOM_SEED sampling
# pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \

View File

@ -150,18 +150,6 @@ run_and_track_test 9 "test_multimodal.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py"
run_and_track_test 10 "test_pallas.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py"
run_and_track_test 11 "test_struct_output_generate.py" \
"HF_HUB_DISABLE_XET=1 python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
run_and_track_test 12 "test_moe_pallas.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
run_and_track_test 13 "test_lora.py" \
"VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
run_and_track_test 14 "test_tpu_qkv_linear.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
run_and_track_test 15 "test_spmd_model_weight_loading.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
run_and_track_test 16 "test_kv_cache_update_kernel.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py"
# After all tests have been attempted, exit with the overall status.
if [ "$overall_script_exit_code" -ne 0 ]; then

View File

@ -11,10 +11,10 @@ cd "$(dirname "${BASH_SOURCE[0]}")/../.."
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
# run python-based benchmarks and upload the result to buildkite
python3 benchmarks/benchmark_latency.py --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
vllm bench latency --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
bench_latency_exit_code=$?
python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
vllm bench throughput --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
bench_throughput_exit_code=$?
# run server-based benchmarks and upload the result to buildkite
@ -24,7 +24,7 @@ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/r
# wait for server to start, timeout after 600 seconds
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
python3 benchmarks/benchmark_serving.py \
vllm bench serve \
--backend vllm \
--dataset-name sharegpt \
--dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \

View File

@ -77,7 +77,7 @@ done
echo "run benchmark test..."
echo "logging to $BM_LOG"
echo
python benchmarks/benchmark_serving.py \
vllm bench serve \
--backend vllm \
--model $MODEL \
--dataset-name sonnet \

View File

@ -7,7 +7,7 @@ permissions:
jobs:
lint-and-deploy:
runs-on: ubuntu-latest
runs-on: ubuntu-24.04-arm
steps:
- name: Checkout
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

View File

@ -98,7 +98,7 @@ Then run the benchmarking script
```bash
# download dataset
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
python3 vllm/benchmarks/benchmark_serving.py \
vllm bench serve \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--endpoint /v1/completions \
@ -111,25 +111,25 @@ If successful, you will see the following output
```
============ Serving Benchmark Result ============
Successful requests: 10
Benchmark duration (s): 5.78
Total input tokens: 1369
Total generated tokens: 2212
Request throughput (req/s): 1.73
Output token throughput (tok/s): 382.89
Total Token throughput (tok/s): 619.85
Successful requests: 10
Benchmark duration (s): 5.78
Total input tokens: 1369
Total generated tokens: 2212
Request throughput (req/s): 1.73
Output token throughput (tok/s): 382.89
Total Token throughput (tok/s): 619.85
---------------Time to First Token----------------
Mean TTFT (ms): 71.54
Median TTFT (ms): 73.88
P99 TTFT (ms): 79.49
Mean TTFT (ms): 71.54
Median TTFT (ms): 73.88
P99 TTFT (ms): 79.49
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms): 7.91
Median TPOT (ms): 7.96
P99 TPOT (ms): 8.03
Mean TPOT (ms): 7.91
Median TPOT (ms): 7.96
P99 TPOT (ms): 8.03
---------------Inter-token Latency----------------
Mean ITL (ms): 7.74
Median ITL (ms): 7.70
P99 ITL (ms): 8.39
Mean ITL (ms): 7.74
Median ITL (ms): 7.70
P99 ITL (ms): 8.39
==================================================
```
@ -141,7 +141,7 @@ If the dataset you want to benchmark is not supported yet in vLLM, even then you
{"prompt": "What is the capital of India?"}
{"prompt": "What is the capital of Iran?"}
{"prompt": "What is the capital of China?"}
```
```
```bash
# start server
@ -150,7 +150,7 @@ VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct --disable-log-requests
```bash
# run benchmarking script
python3 benchmarks/benchmark_serving.py --port 9001 --save-result --save-detailed \
vllm bench serve --port 9001 --save-result --save-detailed \
--backend vllm \
--model meta-llama/Llama-3.1-8B-Instruct \
--endpoint /v1/completions \
@ -174,7 +174,7 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
```
```bash
python3 vllm/benchmarks/benchmark_serving.py \
vllm bench serve \
--backend openai-chat \
--model Qwen/Qwen2-VL-7B-Instruct \
--endpoint /v1/chat/completions \
@ -194,7 +194,7 @@ VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
```
``` bash
python3 benchmarks/benchmark_serving.py \
vllm bench serve \
--model meta-llama/Meta-Llama-3-8B-Instruct \
--dataset-name hf \
--dataset-path likaixin/InstructCoder \
@ -210,7 +210,7 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
**`lmms-lab/LLaVA-OneVision-Data`**
```bash
python3 vllm/benchmarks/benchmark_serving.py \
vllm bench serve \
--backend openai-chat \
--model Qwen/Qwen2-VL-7B-Instruct \
--endpoint /v1/chat/completions \
@ -224,7 +224,7 @@ python3 vllm/benchmarks/benchmark_serving.py \
**`Aeala/ShareGPT_Vicuna_unfiltered`**
```bash
python3 vllm/benchmarks/benchmark_serving.py \
vllm bench serve \
--backend openai-chat \
--model Qwen/Qwen2-VL-7B-Instruct \
--endpoint /v1/chat/completions \
@ -237,7 +237,7 @@ python3 vllm/benchmarks/benchmark_serving.py \
**`AI-MO/aimo-validation-aime`**
``` bash
python3 vllm/benchmarks/benchmark_serving.py \
vllm bench serve \
--model Qwen/QwQ-32B \
--dataset-name hf \
--dataset-path AI-MO/aimo-validation-aime \
@ -248,7 +248,7 @@ python3 vllm/benchmarks/benchmark_serving.py \
**`philschmid/mt-bench`**
``` bash
python3 vllm/benchmarks/benchmark_serving.py \
vllm bench serve \
--model Qwen/QwQ-32B \
--dataset-name hf \
--dataset-path philschmid/mt-bench \
@ -261,7 +261,7 @@ When using OpenAI-compatible backends such as `vllm`, optional sampling
parameters can be specified. Example client command:
```bash
python3 vllm/benchmarks/benchmark_serving.py \
vllm bench serve \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--endpoint /v1/completions \
@ -296,7 +296,7 @@ The following arguments can be used to control the ramp-up:
<br/>
```bash
python3 vllm/benchmarks/benchmark_throughput.py \
vllm bench throughput \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--dataset-name sonnet \
--dataset-path vllm/benchmarks/sonnet.txt \
@ -314,7 +314,7 @@ Total num output tokens: 1500
**VisionArena Benchmark for Vision Language Models**
``` bash
python3 vllm/benchmarks/benchmark_throughput.py \
vllm bench throughput \
--model Qwen/Qwen2-VL-7B-Instruct \
--backend vllm-chat \
--dataset-name hf \
@ -336,7 +336,7 @@ Total num output tokens: 1280
``` bash
VLLM_WORKER_MULTIPROC_METHOD=spawn \
VLLM_USE_V1=1 \
python3 vllm/benchmarks/benchmark_throughput.py \
vllm bench throughput \
--dataset-name=hf \
--dataset-path=likaixin/InstructCoder \
--model=meta-llama/Meta-Llama-3-8B-Instruct \
@ -360,7 +360,7 @@ Total num output tokens: 204800
**`lmms-lab/LLaVA-OneVision-Data`**
```bash
python3 vllm/benchmarks/benchmark_throughput.py \
vllm bench throughput \
--model Qwen/Qwen2-VL-7B-Instruct \
--backend vllm-chat \
--dataset-name hf \
@ -373,7 +373,7 @@ python3 vllm/benchmarks/benchmark_throughput.py \
**`Aeala/ShareGPT_Vicuna_unfiltered`**
```bash
python3 vllm/benchmarks/benchmark_throughput.py \
vllm bench throughput \
--model Qwen/Qwen2-VL-7B-Instruct \
--backend vllm-chat \
--dataset-name hf \
@ -385,7 +385,7 @@ python3 vllm/benchmarks/benchmark_throughput.py \
**`AI-MO/aimo-validation-aime`**
```bash
python3 benchmarks/benchmark_throughput.py \
vllm bench throughput \
--model Qwen/QwQ-32B \
--backend vllm \
--dataset-name hf \
@ -399,7 +399,7 @@ python3 benchmarks/benchmark_throughput.py \
``` bash
# download dataset
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
python3 vllm/benchmarks/benchmark_throughput.py \
vllm bench throughput \
--model meta-llama/Llama-2-7b-hf \
--backend vllm \
--dataset_path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \

View File

@ -105,7 +105,7 @@ After the script finishes, you will find the results in a new, timestamped direc
- **Log Files**: The directory (`$BASE/auto-benchmark/YYYY_MM_DD_HH_MM/`) contains detailed logs for each run:
- `vllm_log_...txt`: The log output from the vLLM server for each parameter combination.
- `bm_log_...txt`: The log output from the `benchmark_serving.py` script for each benchmark run.
- `bm_log_...txt`: The log output from the `vllm bench serve` command for each benchmark run.
- **Final Result Summary**: A file named `result.txt` is created in the log directory. It contains a summary of each tested combination and concludes with the overall best parameters found.
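For quick inspection of a finished run, a sketch like the following (the timestamp is made up; `$BASE` comes from the script's configuration) lists the per-combination logs described above and prints the summary:

```bash
# Hypothetical inspection of one finished auto-benchmark run.
RUN_DIR="$BASE/auto-benchmark/2025_07_27_21_22"
ls "$RUN_DIR"/vllm_log_*.txt   # vLLM server log for each parameter combination
ls "$RUN_DIR"/bm_log_*.txt     # benchmark client log for each benchmark run
cat "$RUN_DIR/result.txt"      # summary of every combination plus the best parameters found
```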

View File

@ -1,6 +1,6 @@
#!/bin/bash
# This script aims to tune the best server parameter combinations to maximize throughput for a given requirement.
# This script aims to tune the best server parameter combinations to maximize throughput for a given requirement.
# See details in README (benchmarks/auto_tune/README.md).
TAG=$(date +"%Y_%m_%d_%H_%M")
@ -56,7 +56,7 @@ start_server() {
local max_num_batched_tokens=$3
local vllm_log=$4
local profile_dir=$5
pkill -f vllm
VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir vllm serve $MODEL \
@ -73,9 +73,9 @@ start_server() {
# wait for 10 minutes...
server_started=0
for i in {1..60}; do
for i in {1..60}; do
RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
if [[ "$STATUS_CODE" -eq 200 ]]; then
server_started=1
break
@ -98,10 +98,10 @@ update_best_profile() {
selected_profile_file=
if [[ "$SYSTEM" == "TPU" ]]; then
selected_profile_file="${sorted_paths[$profile_index]}/*.xplane.pb"
fi
fi
if [[ "$SYSTEM" == "GPU" ]]; then
selected_profile_file="${sorted_paths[$profile_index]}"
fi
fi
rm -f $PROFILE_PATH/*
cp $selected_profile_file $PROFILE_PATH
}
@ -129,14 +129,14 @@ run_benchmark() {
echo "server started."
fi
echo
echo "run benchmark test..."
meet_latency_requirement=0
# get a basic qps by using request-rate inf
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt"
prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 ))
adjusted_input_len=$(( INPUT_LEN - prefix_len ))
python3 benchmarks/benchmark_serving.py \
vllm bench serve \
--backend vllm \
--model $MODEL \
--dataset-name random \
@ -169,7 +169,7 @@ adjusted_input_len=$(( INPUT_LEN - prefix_len ))
curl -X POST http://0.0.0.0:8004/reset_prefix_cache
sleep 5
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt"
python3 benchmarks/benchmark_serving.py \
vllm bench serve \
--backend vllm \
--model $MODEL \
--dataset-name random \

View File

@ -11,6 +11,7 @@ from typing import Any, Optional
import numpy as np
from tqdm import tqdm
from typing_extensions import deprecated
import vllm.envs as envs
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
@ -34,6 +35,10 @@ def save_to_pytorch_benchmark_format(
write_to_json(pt_file, pt_records)
@deprecated(
"benchmark_latency.py is deprecated and will be removed in a "
"future version. Please use 'vllm bench latency' instead.",
)
def main(args: argparse.Namespace):
print(args)

View File

@ -38,6 +38,7 @@ from typing import Any, Literal, Optional
import numpy as np
from tqdm.asyncio import tqdm
from transformers import PreTrainedTokenizerBase
from typing_extensions import deprecated
from backend_request_func import (
ASYNC_REQUEST_FUNCS,
@ -593,6 +594,10 @@ def save_to_pytorch_benchmark_format(
write_to_json(pt_file, pt_records)
@deprecated(
"benchmark_serving.py is deprecated and will be removed in a future "
"version. Please use 'vllm bench serve' instead.",
)
def main(args: argparse.Namespace):
print(args)
random.seed(args.seed)

View File

@ -15,6 +15,7 @@ import torch
import uvloop
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase
from typing_extensions import deprecated
from benchmark_dataset import (
AIMODataset,
@ -382,6 +383,10 @@ def get_requests(args, tokenizer):
return dataset_cls(**common_kwargs).sample(**sample_kwargs)
@deprecated(
"benchmark_throughput.py is deprecated and will be removed in a "
"future version. Please use 'vllm bench throughput' instead.",
)
def main(args: argparse.Namespace):
if args.seed is None:
args.seed = 0

View File

@ -3,7 +3,7 @@
# benchmark the overhead of disaggregated prefill.
# methodology:
# - send all requests to the prefill vLLM instance. It will buffer the KV cache.
# - then send all requests to the decode instance.
# - then send all requests to the decode instance.
# - The TTFT of the decode instance is the overhead.
set -ex
@ -12,6 +12,8 @@ kill_gpu_processes() {
# kill all processes on GPU.
pgrep pt_main_thread | xargs -r kill -9
pgrep python3 | xargs -r kill -9
# vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
pgrep VLLM | xargs -r kill -9
sleep 10
# remove vllm config file
@ -61,7 +63,7 @@ benchmark() {
--gpu-memory-utilization 0.6 \
--kv-transfer-config \
'{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
CUDA_VISIBLE_DEVICES=1 python3 \
-m vllm.entrypoints.openai.api_server \
@ -76,38 +78,38 @@ benchmark() {
wait_for_server 8200
# let the prefill instance finish prefill
python3 ../benchmark_serving.py \
--backend vllm \
--model $model \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--sonnet-input-len $input_len \
--sonnet-output-len "$output_len" \
--sonnet-prefix-len $prefix_len \
--num-prompts $num_prompts \
--port 8100 \
--save-result \
--result-dir $results_folder \
--result-filename disagg_prefill_tp1.json \
--request-rate "inf"
vllm bench serve \
--backend vllm \
--model $model \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--sonnet-input-len $input_len \
--sonnet-output-len "$output_len" \
--sonnet-prefix-len $prefix_len \
--num-prompts $num_prompts \
--port 8100 \
--save-result \
--result-dir $results_folder \
--result-filename disagg_prefill_tp1.json \
--request-rate "inf"
# send the request to decode.
# The TTFT of this command will be the overhead of disagg prefill impl.
python3 ../benchmark_serving.py \
--backend vllm \
--model $model \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--sonnet-input-len $input_len \
--sonnet-output-len "$output_len" \
--sonnet-prefix-len $prefix_len \
--num-prompts $num_prompts \
--port 8200 \
--save-result \
--result-dir $results_folder \
--result-filename disagg_prefill_tp1_overhead.json \
--request-rate "$qps"
vllm bench serve \
--backend vllm \
--model $model \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--sonnet-input-len $input_len \
--sonnet-output-len "$output_len" \
--sonnet-prefix-len $prefix_len \
--num-prompts $num_prompts \
--port 8200 \
--save-result \
--result-dir $results_folder \
--result-filename disagg_prefill_tp1_overhead.json \
--request-rate "$qps"
kill_gpu_processes
}

View File

@ -18,6 +18,8 @@ kill_gpu_processes() {
# kill all processes on GPU.
pgrep pt_main_thread | xargs -r kill -9
pgrep python3 | xargs -r kill -9
# vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
pgrep VLLM | xargs -r kill -9
for port in 8000 8100 8200; do lsof -t -i:$port | xargs -r kill -9; done
sleep 1
}
@ -58,7 +60,7 @@ launch_chunked_prefill() {
launch_disagg_prefill() {
model="meta-llama/Meta-Llama-3.1-8B-Instruct"
model="meta-llama/Meta-Llama-3.1-8B-Instruct"
# disagg prefill
CUDA_VISIBLE_DEVICES=0 python3 \
-m vllm.entrypoints.openai.api_server \
@ -97,20 +99,20 @@ benchmark() {
output_len=$2
tag=$3
python3 ../benchmark_serving.py \
--backend vllm \
--model $model \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--sonnet-input-len $input_len \
--sonnet-output-len "$output_len" \
--sonnet-prefix-len $prefix_len \
--num-prompts $num_prompts \
--port 8000 \
--save-result \
--result-dir $results_folder \
--result-filename "$tag"-qps-"$qps".json \
--request-rate "$qps"
vllm bench serve \
--backend vllm \
--model $model \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--sonnet-input-len $input_len \
--sonnet-output-len "$output_len" \
--sonnet-prefix-len $prefix_len \
--num-prompts $num_prompts \
--port 8000 \
--save-result \
--result-dir $results_folder \
--result-filename "$tag"-qps-"$qps".json \
--request-rate "$qps"
sleep 2
}

View File

@ -5,9 +5,8 @@ import itertools
import torch
from vllm import _custom_ops as ops
from vllm.model_executor.layers.fused_moe.moe_align_block_size import (
moe_align_block_size_triton,
moe_align_block_size,
)
from vllm.triton_utils import triton
@ -21,60 +20,6 @@ def get_topk_ids(num_tokens: int, num_experts: int, topk: int) -> torch.Tensor:
)
def check_correctness(num_tokens, num_experts=256, block_size=256, topk=8):
"""
Verifies vllm vs. Triton
"""
topk_ids = get_topk_ids(num_tokens, num_experts, topk)
# 1. malloc space for triton and vllm
# malloc enough space (max_num_tokens_padded) for the sorted ids
max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
sorted_ids_triton = torch.empty(
(max_num_tokens_padded,), dtype=torch.int32, device="cuda"
)
expert_ids_triton = torch.empty(
(max_num_tokens_padded // block_size,), dtype=torch.int32, device="cuda"
)
num_tokens_post_pad_triton = torch.empty((1,), dtype=torch.int32, device="cuda")
sorted_ids_vllm = torch.empty_like(sorted_ids_triton)
expert_ids_vllm = torch.empty_like(expert_ids_triton)
num_tokens_post_pad_vllm = torch.empty_like(num_tokens_post_pad_triton)
# 2. run implementations
moe_align_block_size_triton(
topk_ids,
num_experts,
block_size,
sorted_ids_triton,
expert_ids_triton,
num_tokens_post_pad_triton,
)
ops.moe_align_block_size(
topk_ids,
num_experts,
block_size,
sorted_ids_vllm,
expert_ids_vllm,
num_tokens_post_pad_vllm,
)
print(f"✅ VLLM implementation works with {num_experts} experts!")
# 3. compare results
if torch.allclose(expert_ids_triton, expert_ids_vllm) and torch.allclose(
num_tokens_post_pad_triton, num_tokens_post_pad_vllm
):
print("✅ Triton and VLLM implementations match.")
else:
print("❌ Triton and VLLM implementations DO NOT match.")
print("Triton expert_ids:", expert_ids_triton)
print("VLLM expert_ids:", expert_ids_vllm)
print("Triton num_tokens_post_pad:", num_tokens_post_pad_triton)
print("VLLM num_tokens_post_pad:", num_tokens_post_pad_vllm)
# test configurations
num_tokens_range = [1, 16, 256, 4096]
num_experts_range = [16, 64, 224, 256, 280, 512]
@ -87,8 +32,8 @@ configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range
x_names=["num_tokens", "num_experts", "topk"],
x_vals=configs,
line_arg="provider",
line_vals=["vllm", "triton"], # "triton"
line_names=["VLLM", "Triton"], # "Triton"
line_vals=["vllm"],
line_names=["vLLM"],
plot_name="moe-align-block-size-performance",
args={},
)
@ -98,36 +43,11 @@ def benchmark(num_tokens, num_experts, topk, provider):
block_size = 256
topk_ids = get_topk_ids(num_tokens, num_experts, topk)
max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
sorted_ids = torch.empty((max_num_tokens_padded,), dtype=torch.int32, device="cuda")
max_num_m_blocks = max_num_tokens_padded // block_size
expert_ids = torch.empty((max_num_m_blocks,), dtype=torch.int32, device="cuda")
num_tokens_post_pad = torch.empty((1,), dtype=torch.int32, device="cuda")
quantiles = [0.5, 0.2, 0.8]
if provider == "vllm":
ms, min_ms, max_ms = triton.testing.do_bench(
lambda: ops.moe_align_block_size(
topk_ids,
num_experts,
block_size,
sorted_ids.clone(),
expert_ids.clone(),
num_tokens_post_pad.clone(),
),
quantiles=quantiles,
)
elif provider == "triton":
ms, min_ms, max_ms = triton.testing.do_bench(
lambda: moe_align_block_size_triton(
topk_ids,
num_experts,
block_size,
sorted_ids.clone(),
expert_ids.clone(),
num_tokens_post_pad.clone(),
),
lambda: moe_align_block_size(topk_ids, block_size, num_experts),
quantiles=quantiles,
)
@ -151,6 +71,4 @@ if __name__ == "__main__":
)
args = parser.parse_args()
print("Running correctness check...")
check_correctness(num_tokens=1024, num_experts=args.num_experts, topk=args.topk)
benchmark.run(print_data=True, show_plots=True)

View File

@ -8,12 +8,13 @@ import ray
import torch
from transformers import AutoConfig
from vllm.model_executor.layers.fused_moe.deep_gemm_moe import (
from vllm.model_executor.layers.fused_moe.fused_moe import *
from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import (
_moe_permute,
_moe_unpermute_and_reduce,
moe_permute,
moe_unpermute,
)
from vllm.model_executor.layers.fused_moe.fused_moe import *
from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import *
from vllm.model_executor.layers.fused_moe.utils import _fp8_quantize
from vllm.platforms import current_platform
from vllm.utils import FlexibleArgumentParser
@ -63,18 +64,19 @@ def benchmark_permute(
def run():
if use_customized_permute:
(permuted_hidden_states, first_token_off, inv_perm_idx, m_indices) = (
moe_permute(
qhidden_states,
topk_weights=topk_weights,
topk_ids=topk_ids,
token_expert_indices=token_expert_indices,
topk=topk,
n_expert=num_experts,
n_local_expert=num_experts,
expert_map=None,
align_block_size=align_block_size,
)
(
permuted_hidden_states,
a1q_scale,
first_token_off,
inv_perm_idx,
m_indices,
) = moe_permute(
qhidden_states,
a1q_scale=None,
topk_ids=topk_ids,
n_expert=num_experts,
expert_map=None,
align_block_size=align_block_size,
)
else:
(
@ -150,18 +152,19 @@ def benchmark_unpermute(
def prepare():
if use_customized_permute:
(permuted_hidden_states, first_token_off, inv_perm_idx, m_indices) = (
moe_permute(
qhidden_states,
topk_weights=topk_weights,
topk_ids=topk_ids,
token_expert_indices=token_expert_indices,
topk=topk,
n_expert=num_experts,
n_local_expert=num_experts,
expert_map=None,
align_block_size=align_block_size,
)
(
permuted_hidden_states,
a1q_scale,
first_token_off,
inv_perm_idx,
m_indices,
) = moe_permute(
qhidden_states,
a1q_scale=None,
topk_ids=topk_ids,
n_expert=num_experts,
expert_map=None,
align_block_size=align_block_size,
)
# convert to fp16/bf16 as gemm output
return (
@ -191,16 +194,19 @@ def benchmark_unpermute(
def run(input: tuple):
if use_customized_permute:
(permuted_hidden_states, first_token_off, inv_perm_idx, m_indices) = input
(
permuted_hidden_states,
first_token_off,
inv_perm_idx,
m_indices,
) = input
output = torch.empty_like(hidden_states)
moe_unpermute(
output,
permuted_hidden_states,
topk_weights,
topk_ids,
inv_perm_idx,
first_token_off,
topk,
num_experts,
num_experts,
)
else:
(
@ -211,7 +217,11 @@ def benchmark_unpermute(
inv_perm,
) = input
_moe_unpermute_and_reduce(
output_hidden_states, permuted_hidden_states, inv_perm, topk_weights
output_hidden_states,
permuted_hidden_states,
inv_perm,
topk_weights,
True,
)
# JIT compilation & warmup

View File

@ -151,7 +151,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
ops.impl("rotary_embedding", torch::kCPU, &rotary_embedding);
// Quantization
#if defined(__AVX512F__) || defined(__aarch64__)
#if defined(__AVX512F__) || (defined(__aarch64__) && !defined(__APPLE__))
at::Tag stride_tag = at::Tag::needs_fixed_stride_order;
// Compute int8 quantized tensor for given scaling factor.

View File

@ -10,32 +10,28 @@
void moe_permute(
const torch::Tensor& input, // [n_token, hidden]
const torch::Tensor& topk_weights, //[n_token, topk]
torch::Tensor& topk_ids, // [n_token, topk]
const torch::Tensor& topk_ids, // [n_token, topk]
const torch::Tensor& token_expert_indices, // [n_token, topk]
const std::optional<torch::Tensor>& expert_map, // [n_expert]
int64_t n_expert, int64_t n_local_expert, int64_t topk,
const std::optional<int64_t>& align_block_size,
torch::Tensor&
permuted_input, // [topk * n_token/align_block_size_m, hidden]
torch::Tensor& permuted_input, // [permuted_size, hidden]
torch::Tensor& expert_first_token_offset, // [n_local_expert + 1]
torch::Tensor& src_row_id2dst_row_id_map, // [n_token, topk]
torch::Tensor& inv_permuted_idx, // [n_token, topk]
torch::Tensor& permuted_idx, // [permute_size]
torch::Tensor& m_indices) { // [align_expand_m]
TORCH_CHECK(topk_weights.scalar_type() == at::ScalarType::Float,
"topk_weights must be float32");
TORCH_CHECK(expert_first_token_offset.scalar_type() == at::ScalarType::Long,
"expert_first_token_offset must be int64");
TORCH_CHECK(topk_ids.scalar_type() == at::ScalarType::Int,
"topk_ids must be int32");
TORCH_CHECK(token_expert_indices.scalar_type() == at::ScalarType::Int,
"token_expert_indices must be int32");
TORCH_CHECK(src_row_id2dst_row_id_map.scalar_type() == at::ScalarType::Int,
"src_row_id2dst_row_id_map must be int32");
TORCH_CHECK(inv_permuted_idx.scalar_type() == at::ScalarType::Int,
"inv_permuted_idx must be int32");
TORCH_CHECK(expert_first_token_offset.size(0) == n_local_expert + 1,
"expert_first_token_offset shape != n_local_expert+1")
TORCH_CHECK(
src_row_id2dst_row_id_map.sizes() == token_expert_indices.sizes(),
"token_expert_indices shape must be same as src_row_id2dst_row_id_map");
TORCH_CHECK(inv_permuted_idx.sizes() == token_expert_indices.sizes(),
"token_expert_indices shape must be same as inv_permuted_idx");
auto n_token = input.sizes()[0];
auto n_hidden = input.sizes()[1];
auto align_block_size_value =
@ -46,8 +42,9 @@ void moe_permute(
auto sort_workspace = torch::empty(
{sorter_size},
torch::dtype(torch::kInt8).device(torch::kCUDA).requires_grad(false));
auto copy_topk_ids = topk_ids.clone(); // copy topk_ids for preprocess
auto permuted_experts_id = torch::empty_like(topk_ids);
auto dst_row_id2src_row_id_map = torch::empty_like(src_row_id2dst_row_id_map);
auto sorted_row_idx = torch::empty_like(inv_permuted_idx);
auto align_expert_first_token_offset =
torch::zeros_like(expert_first_token_offset);
@ -67,24 +64,22 @@ void moe_permute(
const int* expert_map_ptr = get_ptr<int>(expert_map.value());
valid_num_ptr =
get_ptr<int64_t>(expert_first_token_offset) + n_local_expert;
preprocessTopkIdLauncher(get_ptr<int>(topk_ids), n_token * topk,
preprocessTopkIdLauncher(get_ptr<int>(copy_topk_ids), n_token * topk,
expert_map_ptr, n_expert, stream);
}
// expert sort topk expert id and scan expert id get expert_first_token_offset
sortAndScanExpert(get_ptr<int>(topk_ids), get_ptr<int>(token_expert_indices),
get_ptr<int>(permuted_experts_id),
get_ptr<int>(dst_row_id2src_row_id_map),
get_ptr<int64_t>(expert_first_token_offset), n_token,
n_expert, n_local_expert, topk, sorter,
get_ptr<int>(sort_workspace), stream);
sortAndScanExpert(
get_ptr<int>(copy_topk_ids), get_ptr<int>(token_expert_indices),
get_ptr<int>(permuted_experts_id), get_ptr<int>(sorted_row_idx),
get_ptr<int64_t>(expert_first_token_offset), n_token, n_expert,
n_local_expert, topk, sorter, get_ptr<int>(sort_workspace), stream);
// dispatch expandInputRowsKernelLauncher
MOE_DISPATCH(input.scalar_type(), [&] {
expandInputRowsKernelLauncher<scalar_t>(
get_ptr<scalar_t>(input), get_ptr<scalar_t>(permuted_input),
get_ptr<float>(topk_weights), get_ptr<int>(permuted_experts_id),
get_ptr<int>(dst_row_id2src_row_id_map),
get_ptr<int>(src_row_id2dst_row_id_map),
get_ptr<int>(permuted_experts_id), get_ptr<int>(sorted_row_idx),
get_ptr<int>(inv_permuted_idx), get_ptr<int>(permuted_idx),
get_ptr<int64_t>(expert_first_token_offset), n_token, valid_num_ptr,
n_hidden, topk, n_local_expert, align_block_size_value, stream);
});
@ -101,32 +96,34 @@ void moe_permute(
}
void moe_unpermute(
const torch::Tensor& permuted_hidden_states, // [n_token * topk, hidden]
const torch::Tensor& topk_weights, //[n_token, topk]
const torch::Tensor& topk_ids, // [n_token, topk]
const torch::Tensor& src_row_id2dst_row_id_map, // [n_token, topk]
const torch::Tensor& expert_first_token_offset, // [n_local_expert+1]
int64_t n_expert, int64_t n_local_expert, int64_t topk,
const torch::Tensor& permuted_hidden_states, // [n_token * topk, hidden]
const torch::Tensor& topk_weights, // [n_token, topk]
const torch::Tensor& inv_permuted_idx, // [n_token, topk]
const std::optional<torch::Tensor>&
expert_first_token_offset, // [n_local_expert+1]
int64_t topk,
torch::Tensor& hidden_states // [n_token, hidden]
) {
TORCH_CHECK(src_row_id2dst_row_id_map.sizes() == topk_ids.sizes(),
"topk_ids shape must be same as src_row_id2dst_row_id_map");
TORCH_CHECK(topk_ids.scalar_type() == at::ScalarType::Int,
"topk_ids must be int32");
TORCH_CHECK(
permuted_hidden_states.scalar_type() == hidden_states.scalar_type(),
"topk_ids dtype must be same as src_row_id2dst_row_id_map");
"permuted_hidden_states dtype must be same as hidden_states");
auto n_token = hidden_states.size(0);
auto n_hidden = hidden_states.size(1);
auto stream = at::cuda::getCurrentCUDAStream().stream();
const int64_t* valid_ptr =
get_ptr<int64_t>(expert_first_token_offset) + n_local_expert;
int64_t const* valid_ptr = nullptr;
if (expert_first_token_offset.has_value()) {
int n_local_expert = expert_first_token_offset.value().size(0) - 1;
valid_ptr =
get_ptr<int64_t>(expert_first_token_offset.value()) + n_local_expert;
}
MOE_DISPATCH(hidden_states.scalar_type(), [&] {
finalizeMoeRoutingKernelLauncher<scalar_t, scalar_t>(
get_ptr<scalar_t>(permuted_hidden_states),
get_ptr<scalar_t>(hidden_states), get_ptr<float>(topk_weights),
get_ptr<int>(src_row_id2dst_row_id_map), get_ptr<int>(topk_ids),
n_token, n_hidden, topk, valid_ptr, stream);
get_ptr<int>(inv_permuted_idx), n_token, n_hidden, topk, valid_ptr,
stream);
});
}

View File

@ -177,7 +177,7 @@ __global__ void getMIndicesKernel(int64_t* expert_first_token_offset,
int tidx = threadIdx.x;
extern __shared__ int64_t smem_expert_first_token_offset[];
for (int i = tidx; i <= num_local_expert; i += blockDim.x) {
smem_expert_first_token_offset[tidx] = __ldg(expert_first_token_offset + i);
smem_expert_first_token_offset[i] = __ldg(expert_first_token_offset + i);
}
__syncthreads();
auto last_token_offset = smem_expert_first_token_offset[eidx + 1];

View File

@ -57,31 +57,19 @@ void sortAndScanExpert(int* expert_for_source_row, const int* source_rows,
template <typename T>
void expandInputRowsKernelLauncher(
T const* unpermuted_input, T* permuted_output,
const float* unpermuted_scales, int* sorted_experts,
T const* unpermuted_input, T* permuted_output, int* sorted_experts,
int const* expanded_dest_row_to_expanded_source_row,
int* expanded_source_row_to_expanded_dest_row,
int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
int64_t* expert_first_token_offset, int64_t const num_rows,
int64_t const* num_valid_tokens_ptr, int64_t const cols, int const k,
int num_local_experts, const int& align_block_size, cudaStream_t stream);
// Final kernel to unpermute and scale
// This kernel unpermutes the original data, does the k-way reduction and
// performs the final skip connection.
template <typename T, typename OutputType, bool CHECK_SKIPPED>
__global__ void finalizeMoeRoutingKernel(
T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output,
float const* scales, int const* expanded_source_row_to_expanded_dest_row,
int const* expert_for_source_row, int64_t const orig_cols, int64_t const k,
int64_t const* num_valid_ptr);
template <class T, class OutputType>
void finalizeMoeRoutingKernelLauncher(
T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output,
float const* scales, int const* expanded_source_row_to_expanded_dest_row,
int const* expert_for_source_row, int64_t const num_rows,
int64_t const cols, int64_t const k, int64_t const* num_valid_ptr,
cudaStream_t stream);
int64_t const num_rows, int64_t const cols, int64_t const k,
int64_t const* num_valid_ptr, cudaStream_t stream);
void preprocessTopkIdLauncher(int* topk_id_ptr, int size,
const int* expert_map_ptr, int num_experts,

View File

@ -2,10 +2,9 @@
template <typename T, bool CHECK_SKIPPED, bool ALIGN_BLOCK_SIZE>
__global__ void expandInputRowsKernel(
T const* unpermuted_input, T* permuted_output,
const float* unpermuted_scales, int* sorted_experts,
T const* unpermuted_input, T* permuted_output, int* sorted_experts,
int const* expanded_dest_row_to_expanded_source_row,
int* expanded_source_row_to_expanded_dest_row,
int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
int64_t* expert_first_token_offset, int64_t const num_rows,
int64_t const* num_dest_rows, int64_t const cols, int64_t k,
int num_local_experts, int align_block_size) {
@ -54,6 +53,10 @@ __global__ void expandInputRowsKernel(
assert(expanded_dest_row <= INT32_MAX);
expanded_source_row_to_expanded_dest_row[expanded_source_row] =
static_cast<int>(expanded_dest_row);
// skip non local expert token
if (!CHECK_SKIPPED || blockIdx.x < *num_dest_rows) {
permuted_idx[expanded_dest_row] = expanded_source_row;
}
}
if (!CHECK_SKIPPED || blockIdx.x < *num_dest_rows) {
@ -62,7 +65,7 @@ __global__ void expandInputRowsKernel(
using DataElem = cutlass::Array<T, ELEM_PER_THREAD>;
// Duplicate and permute rows
int64_t const source_row = expanded_source_row % num_rows;
int64_t const source_row = expanded_source_row / k;
auto const* source_row_ptr =
reinterpret_cast<DataElem const*>(unpermuted_input + source_row * cols);
@ -82,10 +85,9 @@ __global__ void expandInputRowsKernel(
template <typename T>
void expandInputRowsKernelLauncher(
T const* unpermuted_input, T* permuted_output,
const float* unpermuted_scales, int* sorted_experts,
T const* unpermuted_input, T* permuted_output, int* sorted_experts,
int const* expanded_dest_row_to_expanded_source_row,
int* expanded_source_row_to_expanded_dest_row,
int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
int64_t* expert_first_token_offset, int64_t const num_rows,
int64_t const* num_valid_tokens_ptr, int64_t const cols, int const k,
int num_local_experts, const int& align_block_size, cudaStream_t stream) {
@ -105,11 +107,11 @@ void expandInputRowsKernelLauncher(
int64_t smem_size = sizeof(int64_t) * (num_local_experts + 1);
func<<<blocks, threads, smem_size, stream>>>(
unpermuted_input, permuted_output, unpermuted_scales, sorted_experts,
unpermuted_input, permuted_output, sorted_experts,
expanded_dest_row_to_expanded_source_row,
expanded_source_row_to_expanded_dest_row, expert_first_token_offset,
num_rows, num_valid_tokens_ptr, cols, k, num_local_experts,
align_block_size);
expanded_source_row_to_expanded_dest_row, permuted_idx,
expert_first_token_offset, num_rows, num_valid_tokens_ptr, cols, k,
num_local_experts, align_block_size);
}
template <class T, class U>
@ -128,11 +130,9 @@ template <typename T, typename OutputType, bool CHECK_SKIPPED>
__global__ void finalizeMoeRoutingKernel(
T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output,
float const* scales, int const* expanded_source_row_to_expanded_dest_row,
int const* expert_for_source_row, int64_t const orig_cols, int64_t const k,
int64_t const* num_valid_ptr) {
int64_t const orig_cols, int64_t const k, int64_t const* num_valid_ptr) {
assert(orig_cols % 4 == 0);
int64_t const original_row = blockIdx.x;
int64_t const num_rows = gridDim.x;
auto const offset = original_row * orig_cols;
OutputType* reduced_row_ptr = reduced_unpermuted_output + offset;
int64_t const num_valid = *num_valid_ptr;
@ -159,14 +159,13 @@ __global__ void finalizeMoeRoutingKernel(
ComputeElem thread_output;
thread_output.fill(0);
for (int k_idx = 0; k_idx < k; ++k_idx) {
int64_t const expanded_original_row = original_row + k_idx * num_rows;
int64_t const expanded_original_row = original_row * k + k_idx;
int64_t const expanded_permuted_row =
expanded_source_row_to_expanded_dest_row[expanded_original_row];
int64_t const k_offset = original_row * k + k_idx;
float const row_scale = scales[k_offset];
// Check after row_rescale has accumulated
if (CHECK_SKIPPED && expanded_permuted_row >= num_valid) {
continue;
}
@ -189,9 +188,8 @@ template <class T, class OutputType>
void finalizeMoeRoutingKernelLauncher(
T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output,
float const* scales, int const* expanded_source_row_to_expanded_dest_row,
int const* expert_for_source_row, int64_t const num_rows,
int64_t const cols, int64_t const k, int64_t const* num_valid_ptr,
cudaStream_t stream) {
int64_t const num_rows, int64_t const cols, int64_t const k,
int64_t const* num_valid_ptr, cudaStream_t stream) {
int64_t const blocks = num_rows;
int64_t const threads = 256;
bool const check_finished = num_valid_ptr != nullptr;
@ -201,6 +199,5 @@ void finalizeMoeRoutingKernelLauncher(
auto* const kernel = func_map[check_finished];
kernel<<<blocks, threads, 0, stream>>>(
expanded_permuted_rows, reduced_unpermuted_output, scales,
expanded_source_row_to_expanded_dest_row, expert_for_source_row, cols, k,
num_valid_ptr);
expanded_source_row_to_expanded_dest_row, cols, k, num_valid_ptr);
}

View File

@ -56,18 +56,17 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
" -> Tensor");
m.def(
"moe_permute(Tensor input, Tensor topk_weight, Tensor! topk_ids,"
"moe_permute(Tensor input, Tensor topk_ids,"
"Tensor token_expert_indices, Tensor? expert_map, int n_expert,"
"int n_local_expert,"
"int topk, int? align_block_size,Tensor! permuted_input, Tensor! "
"expert_first_token_offset, Tensor! src_row_id2dst_row_id_map, Tensor! "
"m_indices)->()");
"expert_first_token_offset, Tensor! inv_permuted_idx, Tensor! "
"permuted_idx, Tensor! m_indices)->()");
m.def(
"moe_unpermute(Tensor permuted_hidden_states, Tensor topk_weights,"
"Tensor topk_ids,Tensor src_row_id2dst_row_id_map, Tensor "
"expert_first_token_offset, int n_expert, int n_local_expert,int "
"topk, Tensor! hidden_states)->()");
"Tensor inv_permuted_idx, Tensor? expert_first_token_offset, "
"int topk, Tensor! hidden_states)->()");
m.def("moe_permute_unpermute_supported() -> bool");
m.impl("moe_permute_unpermute_supported", &moe_permute_unpermute_supported);

View File

@ -292,6 +292,11 @@ void per_token_group_quant_fp8(const torch::Tensor& input,
torch::Tensor& output_q, torch::Tensor& output_s,
int64_t group_size, double eps, double fp8_min,
double fp8_max, bool scale_ue8m0);
void per_token_group_quant_int8(const torch::Tensor& input,
torch::Tensor& output_q,
torch::Tensor& output_s, int64_t group_size,
double eps, double int8_min, double int8_max);
#endif
void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,

View File

@ -1,6 +1,8 @@
#include <ATen/cuda/CUDAContext.h>
#include <torch/all.h>
#include "../per_token_group_quant_8bit.h"
#include <cmath>
#include "../../dispatch_utils.h"
@ -336,3 +338,11 @@ void dynamic_scaled_int8_quant(
}
});
}
void per_token_group_quant_int8(const torch::Tensor& input,
torch::Tensor& output_q,
torch::Tensor& output_s, int64_t group_size,
double eps, double int8_min, double int8_max) {
per_token_group_quant_8bit(input, output_q, output_s, group_size, eps,
int8_min, int8_max);
}

View File

@ -1,6 +1,8 @@
#include <ATen/cuda/CUDAContext.h>
#include <c10/util/Float8_e4m3fn.h>
#include "../per_token_group_quant_8bit.h"
#include <cmath>
#include <cuda_fp16.h>
@ -120,7 +122,7 @@ void per_token_group_quant_8bit(const torch::Tensor& input,
torch::Tensor& output_q,
torch::Tensor& output_s, int64_t group_size,
double eps, double min_8bit, double max_8bit,
bool scale_ue8m0 = false) {
bool scale_ue8m0) {
TORCH_CHECK(input.is_contiguous());
TORCH_CHECK(output_q.is_contiguous());
@ -198,6 +200,8 @@ void per_token_group_quant_8bit(const torch::Tensor& input,
input.scalar_type(), "per_token_group_quant_8bit", ([&] {
if (dst_type == at::ScalarType::Float8_e4m3fn) {
LAUNCH_KERNEL(scalar_t, c10::Float8_e4m3fn);
} else if (dst_type == at::ScalarType::Char) {
LAUNCH_KERNEL(scalar_t, int8_t);
}
}));

View File

@ -187,8 +187,12 @@ struct PrepackedLayoutBTemplate {
CUTE_HOST_DEVICE static constexpr auto TVbNbKL_to_offset_copy(
Shape_NKL shape_mkl) {
auto layout = TVbNbKL_to_offset(shape_mkl);
return make_layout(coalesce(get<0>(layout)), get<1>(layout),
get<2>(layout));
// for 4-bit elements, having >= 64 values per column
// (64 x 4 bit = 256 bit = 32 bytes) allows TMA to load full 32-byte sectors
auto inner_layout =
make_layout(make_shape(_256{}, size<0>(layout) / _256{}));
return make_layout(inner_layout, get<1>(layout), get<2>(layout));
}
// ((BlockN, BlockK), (BlocksN, BlocksK), L) -> (storage_idx)

View File

@ -0,0 +1,10 @@
#pragma once
#include <torch/all.h>
// TODO(wentao): refactor the folder to 8bit, which then includes the fp8 and int8 folders
// 8-bit per-token-group quantization helper used by both FP8 and INT8
void per_token_group_quant_8bit(const torch::Tensor& input,
torch::Tensor& output_q,
torch::Tensor& output_s, int64_t group_size,
double eps, double min_8bit, double max_8bit,
bool scale_ue8m0 = false);

View File

@ -624,6 +624,14 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
ops.impl("per_token_group_fp8_quant", torch::kCUDA,
&per_token_group_quant_fp8);
// Compute per-token-group INT8 quantized tensor and scaling factor.
ops.def(
"per_token_group_quant_int8(Tensor input, Tensor! output_q, Tensor! "
"output_s, int group_size, float eps, float int8_min, float int8_max) -> "
"()");
ops.impl("per_token_group_quant_int8", torch::kCUDA,
&per_token_group_quant_int8);
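
For context, here is a hedged Python-side sketch of how the op registered above could be exercised once the extension is loaded. The `torch.ops._C` namespace, the scale tensor's shape/dtype, and the eps/min/max values are illustrative assumptions, not part of this change:

```python
import torch

# Hedged illustration of the per-token-group INT8 quantization op.
# Assumes the op is exposed under torch.ops._C and that output_s holds one
# float32 scale per (token, group); both are assumptions for this sketch.
x = torch.randn(16, 256, device="cuda", dtype=torch.float16)
group_size = 128
q = torch.empty_like(x, dtype=torch.int8)  # quantized output, same shape as input
s = torch.empty(16, 256 // group_size, device="cuda", dtype=torch.float32)  # per-group scales
torch.ops._C.per_token_group_quant_int8(x, q, s, group_size, 1e-10, -128.0, 127.0)
```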
// reorder weight for AllSpark Ampere W8A16 Fused Gemm kernel
ops.def(
"rearrange_kn_weight_as_n32k16_order(Tensor b_qweight, Tensor b_scales, "

View File

@ -1,62 +0,0 @@
# This vLLM Dockerfile is used to construct an image that can build and run vLLM on ARM CPU platform.
FROM ubuntu:22.04 AS cpu-test-arm
ENV CCACHE_DIR=/root/.cache/ccache
ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
RUN --mount=type=cache,target=/var/cache/apt \
apt-get update -y \
&& apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
# tcmalloc provides better memory allocation efficiency, e.g., holding memory in caches to speed up access of commonly-used objects.
RUN --mount=type=cache,target=/root/.cache/pip \
pip install py-cpuinfo # Use this to gather CPU info and optimize based on ARM Neoverse cores
# Set LD_PRELOAD for tcmalloc on ARM
ENV LD_PRELOAD="/usr/lib/aarch64-linux-gnu/libtcmalloc_minimal.so.4"
RUN echo 'ulimit -c 0' >> ~/.bashrc
WORKDIR /workspace
ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,src=requirements/build.txt,target=requirements/build.txt \
pip install --upgrade pip && \
pip install -r requirements/build.txt
FROM cpu-test-arm AS build
WORKDIR /workspace/vllm
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,src=requirements/common.txt,target=requirements/common.txt \
--mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \
pip install -v -r requirements/cpu.txt
COPY . .
ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
# Disabling AVX512 specific optimizations for ARM
ARG VLLM_CPU_DISABLE_AVX512="true"
ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/ccache \
--mount=type=bind,source=.git,target=.git \
VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
pip install dist/*.whl && \
rm -rf dist
WORKDIR /workspace/
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]

View File

@ -1,4 +1,11 @@
# This vLLM Dockerfile is used to construct image that can build and run vLLM on x86 CPU platform.
# This vLLM Dockerfile is used to build images that can run vLLM on both x86_64 and arm64 CPU platforms.
#
# Supported platforms:
# - linux/amd64 (x86_64)
# - linux/arm64 (aarch64)
#
# Use the `--platform` option with `docker buildx build` to specify the target architecture, e.g.:
# docker buildx build --platform=linux/arm64 -f docker/Dockerfile.cpu .
#
# Build targets:
# vllm-openai (default): used for serving deployment
@ -53,7 +60,20 @@ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --upgrade pip && \
uv pip install -r requirements/cpu.txt
ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/opt/venv/lib/libiomp5.so:$LD_PRELOAD"
ARG TARGETARCH
ENV TARGETARCH=${TARGETARCH}
RUN if [ "$TARGETARCH" = "arm64" ]; then \
PRELOAD_PATH="/usr/lib/aarch64-linux-gnu/libtcmalloc_minimal.so.4"; \
else \
PRELOAD_PATH="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/opt/venv/lib/libiomp5.so"; \
fi && \
echo "export LD_PRELOAD=$PRELOAD_PATH" >> ~/.bashrc
# Ensure that the exported LD_PRELOAD environment variable takes effect.
SHELL ["/bin/bash", "-c"]
ENV LD_PRELOAD=${LD_PRELOAD}
RUN echo 'ulimit -c 0' >> ~/.bashrc

View File

@ -1,4 +1,4 @@
ARG NIGHTLY_DATE="20250714"
ARG NIGHTLY_DATE="20250724"
ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.12_tpuvm_$NIGHTLY_DATE"
FROM $BASE_IMAGE

View File

@ -9,10 +9,13 @@ We support tracing vLLM workers using the `torch.profiler` module. You can enabl
The OpenAI server also needs to be started with the `VLLM_TORCH_PROFILER_DIR` environment variable set.
When using `benchmarks/benchmark_serving.py`, you can enable profiling by passing the `--profile` flag.
When using `vllm bench serve`, you can enable profiling by passing the `--profile` flag.
Traces can be visualized using <https://ui.perfetto.dev/>.
!!! tip
You can directly call the bench module without installing vLLM by using `python -m vllm.entrypoints.cli.main bench`.
!!! tip
Only send a few requests through vLLM when profiling, as the traces can get quite large. Also, there is no need to untar the traces; they can be viewed directly.
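
As a hedged illustration of keeping the trace small when profiling the online server manually (rather than via the `--profile` flag), a single short request can be wrapped between the profiler start/stop endpoints. The host, port, and payload below are placeholders, and the snippet assumes the server was started with `VLLM_TORCH_PROFILER_DIR` set so the `/start_profile` and `/stop_profile` routes are available:

```python
# Hedged example: profile just one short request to keep the trace small.
# Host, port, and model name are placeholders for illustration.
import requests

base = "http://localhost:8000"
requests.post(f"{base}/start_profile")
requests.post(
    f"{base}/v1/completions",
    json={"model": "meta-llama/Meta-Llama-3-70B", "prompt": "Hello", "max_tokens": 8},
)
requests.post(f"{base}/stop_profile")
```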
@ -35,10 +38,10 @@ VLLM_TORCH_PROFILER_DIR=./vllm_profile \
--model meta-llama/Meta-Llama-3-70B
```
benchmark_serving.py:
vllm bench command:
```bash
python benchmarks/benchmark_serving.py \
vllm bench serve \
--backend vllm \
--model meta-llama/Meta-Llama-3-70B \
--dataset-name sharegpt \
@ -69,13 +72,13 @@ apt install nsight-systems-cli
For basic usage, you can just append `nsys profile -o report.nsys-rep --trace-fork-before-exec=true --cuda-graph-trace=node` before any existing script you would run for offline inference.
The following is an example using the `benchmarks/benchmark_latency.py` script:
The following is an example using the `vllm bench latency` script:
```bash
nsys profile -o report.nsys-rep \
--trace-fork-before-exec=true \
--cuda-graph-trace=node \
python benchmarks/benchmark_latency.py \
vllm bench latency \
--model meta-llama/Llama-3.1-8B-Instruct \
--num-iters-warmup 5 \
--num-iters 1 \
@ -98,7 +101,7 @@ nsys profile -o report.nsys-rep \
vllm serve meta-llama/Llama-3.1-8B-Instruct
# client
python benchmarks/benchmark_serving.py \
vllm bench serve \
--backend vllm \
--model meta-llama/Llama-3.1-8B-Instruct \
--num-prompts 1 \
@ -132,7 +135,7 @@ You can view these profiles either as summaries in the CLI, using `nsys stats [p
...
** CUDA GPU Kernel Summary (cuda_gpu_kern_sum):
 Time (%)  Total Time (ns)  Instances   Avg (ns)     Med (ns)    Min (ns)   Max (ns)   StdDev (ns)            Name
-------- --------------- --------- ----------- ----------- -------- --------- ----------- ----------------------------------------------------------------------------------------------------
46.3 10,327,352,338 17,505 589,965.9 144,383.0 27,040 3,126,460 944,263.8 sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize128x128x64_warpgroupsize1x1x1_execute_segment_k_of…
14.8 3,305,114,764 5,152 641,520.7 293,408.0 287,296 2,822,716 867,124.9 sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize256x128x64_warpgroupsize2x1x1_execute_segment_k_of…
@ -143,7 +146,7 @@ You can view these profiles either as summaries in the CLI, using `nsys stats [p
2.6 587,283,113 37,824 15,526.7 3,008.0 2,719 2,517,756 139,091.1 std::enable_if<T2>(int)0&&vllm::_typeConvert<T1>::exists, void>::type vllm::fused_add_rms_norm_kern…
1.9 418,362,605 18,912 22,121.5 3,871.0 3,328 2,523,870 175,248.2 void vllm::rotary_embedding_kernel<c10::BFloat16, (bool)1>(const long *, T1 *, T1 *, const T1 *, in…
0.7 167,083,069 18,880 8,849.7 2,240.0 1,471 2,499,996 101,436.1 void vllm::reshape_and_cache_flash_kernel<__nv_bfloat16, __nv_bfloat16, (vllm::Fp8KVCacheDataType)0
...
```
GUI example:

View File

@ -3,14 +3,14 @@ An implementation of xPyD with dynamic scaling based on point-to-point communica
# Detailed Design
## Overall Process
As shown in Figure 1, the overall process of this **PD disaggregation** solution is described through a request flow:
1. The client sends an HTTP request to the Proxy/Router's `/v1/completions` interface.
2. The Proxy/Router selects a **1P1D (1 Prefill instance + 1 Decode instance)** through either round-robin or random selection, generates a `request_id` (rules to be introduced later), modifies the `max_tokens` in the HTTP request message to **1**, and then forwards the request to the **P instance**.
3. Immediately afterward, the Proxy/Router forwards the **original HTTP request** to the **D instance**.
4. The **P instance** performs **Prefill** and then **actively sends the generated KV cache** to the D instance (using **PUT_ASYNC** mode). The D instance's `zmq_addr` can be resolved through the `request_id`.
5. The **D instance** has a **dedicated thread** for receiving the KV cache (to avoid blocking the main process). The received KV cache is saved into the **GPU memory buffer**, the size of which is determined by the vLLM startup parameter `kv_buffer_size`. When the GPU buffer is full, the KV cache is stored in the **local Tensor memory pool**.
6. During the **Decode**, the D instance's main process retrieves the KV cache (transmitted by the P instance) from either the **GPU buffer** or the **memory pool**, thereby **skipping Prefill**.
7. After completing **Decode**, the D instance returns the result to the **Proxy/Router**, which then forwards it to the **client**.
![image1](https://github.com/user-attachments/assets/fb01bde6-755b-49f7-ad45-48a94b1e10a7)
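
Below is a minimal, hedged sketch (not the project's actual proxy implementation) of steps 2-3 above: the proxy sends a `max_tokens = 1` copy of the request to the Prefill instance and the unmodified request to the Decode instance. The URLs, the header name, and the helper's signature are illustrative assumptions:

```python
import copy

import requests


def forward_request(body: dict, prefill_url: str, decode_url: str, request_id: str):
    # Illustrative only: the header name and URL layout are assumptions, not the
    # project's wire format. The request_id would follow the rules described later.
    headers = {"X-Request-Id": request_id}

    # Step 2: the P instance only needs to run Prefill, so cap generation at one token.
    prefill_body = copy.deepcopy(body)
    prefill_body["max_tokens"] = 1
    requests.post(f"{prefill_url}/v1/completions", json=prefill_body, headers=headers)

    # Step 3: the D instance receives the original request and produces the completion.
    return requests.post(f"{decode_url}/v1/completions", json=body, headers=headers)
```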
@ -291,7 +291,7 @@ curl -X POST -s http://10.0.1.1:10001/v1/completions \
??? console "Command"
```shell
python3 benchmark_serving.py \
vllm bench serve \
--backend vllm \
--model base_model \
--tokenizer meta-llama/Llama-3.1-8B-Instruct \

View File

@ -177,6 +177,70 @@ Multi-image input can be extended to perform video captioning. We show this with
You can pass a list of NumPy arrays directly to the `'video'` field of the multi-modal dictionary
instead of using multi-image input.
Instead of NumPy arrays, you can also pass `torch.Tensor` instances, as shown in this example using Qwen2.5-VL:
??? code
```python
from transformers import AutoProcessor
from vllm import LLM, SamplingParams
from qwen_vl_utils import process_vision_info
model_path = "Qwen/Qwen2.5-VL-3B-Instruct/"
video_path = "https://content.pexels.com/videos/free-videos.mp4"
llm = LLM(
model=model_path,
gpu_memory_utilization=0.8,
enforce_eager=True,
limit_mm_per_prompt={"video": 1},
)
sampling_params = SamplingParams(
max_tokens=1024,
)
video_messages = [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": [
{"type": "text", "text": "describe this video."},
{
"type": "video",
"video": video_path,
"total_pixels": 20480 * 28 * 28,
"min_pixels": 16 * 28 * 28
}
]
},
]
messages = video_messages
processor = AutoProcessor.from_pretrained(model_path)
prompt = processor.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True,
)
image_inputs, video_inputs = process_vision_info(messages)
mm_data = {}
if video_inputs is not None:
mm_data["video"] = video_inputs
llm_inputs = {
"prompt": prompt,
"multi_modal_data": mm_data,
}
outputs = llm.generate([llm_inputs], sampling_params=sampling_params)
for o in outputs:
generated_text = o.outputs[0].text
print(generated_text)
```
!!! note
`process_vision_info` is only applicable to Qwen2.5-VL and similar models.
Full example: <gh-file:examples/offline_inference/vision_language.py>
### Audio Inputs

View File

@ -6,6 +6,7 @@ Contents:
- [Supported Hardware](supported_hardware.md)
- [AutoAWQ](auto_awq.md)
- [AutoRound](auto_round.md)
- [BitsAndBytes](bnb.md)
- [BitBLAS](bitblas.md)
- [GGUF](gguf.md)

View File

@ -0,0 +1,103 @@
# AutoRound
[AutoRound](https://github.com/intel/auto-round) is Intel's advanced quantization algorithm designed to produce highly efficient **INT2, INT3, INT4, and INT8**
quantized large language models—striking an optimal balance between accuracy and deployment performance.
AutoRound applies weight-only quantization to transformer-based models, enabling significant memory savings and faster
inference while maintaining near-original accuracy. It supports a wide range of hardware platforms, including **CPUs,
Intel GPUs, HPUs, and CUDA-enabled devices**.
Please refer to the [AutoRound guide](https://github.com/intel/auto-round/blob/main/docs/step_by_step.md) for more details.
Key Features:
✅ **AutoRound, AutoAWQ, AutoGPTQ, and GGUF** are supported
✅ **10+ vision-language models (VLMs)** are supported
✅ **Per-layer mixed-bit quantization** for fine-grained control
✅ **RTN (Round-To-Nearest) mode** for quick quantization with slight accuracy loss
✅ **Multiple quantization recipes**: best, base, and light
✅ Advanced utilities such as immediate packing and support for **10+ backends**
## Installation
```bash
uv pip install auto-round
```
## Quantizing a model
For VLMs, please change to `auto-round-mllm` in CLI usage and `AutoRoundMLLM` in API usage.
### CLI usage
```bash
auto-round \
--model Qwen/Qwen3-0.6B \
--bits 4 \
--group_size 128 \
--format "auto_round" \
--output_dir ./tmp_autoround
```
```bash
auto-round \
--model Qwen/Qwen3-0.6B \
--format "gguf:q4_k_m" \
--output_dir ./tmp_autoround
```
### API usage
```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from auto_round import AutoRound
model_name = "Qwen/Qwen3-0.6B"
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)
bits, group_size, sym = 4, 128, True
autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym)
# the best accuracy, 4-5X slower, low_gpu_mem_usage could save ~20G but ~30% slower
# autoround = AutoRound(model, tokenizer, nsamples=512, iters=1000, low_gpu_mem_usage=True, bits=bits, group_size=group_size, sym=sym)
# 2-3X speedup, slight accuracy drop at W4G128
# autoround = AutoRound(model, tokenizer, nsamples=128, iters=50, lr=5e-3, bits=bits, group_size=group_size, sym=sym )
output_dir = "./tmp_autoround"
# format= 'auto_round'(default), 'auto_gptq', 'auto_awq'
autoround.quantize_and_save(output_dir, format="auto_round")
```
## Running a quantized model with vLLM
Here is some example code to run auto-round format in vLLM:
```python
from vllm import LLM, SamplingParams
prompts = [
"Hello, my name is",
]
sampling_params = SamplingParams(temperature=0.6, top_p=0.95)
model_name = "Intel/DeepSeek-R1-0528-Qwen3-8B-int4-AutoRound"
llm = LLM(model=model_name)
outputs = llm.generate(prompts, sampling_params)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
## Acknowledgement
Special thanks to open-source low precision libraries such as AutoGPTQ, AutoAWQ, GPTQModel, Triton, Marlin, and
ExLLaMAV2 for providing low-precision CUDA kernels, which are leveraged in AutoRound.

View File

@ -33,7 +33,7 @@ Testing has been conducted on AWS Graviton3 instances for compatibility.
# --8<-- [end:pre-built-images]
# --8<-- [start:build-image-from-source]
```bash
docker build -f docker/Dockerfile.arm \
docker build -f docker/Dockerfile.cpu \
--tag vllm-cpu-env .
# Launching OpenAI server

View File

@ -365,6 +365,7 @@ th {
| `Grok1ModelForCausalLM` | Grok1 | `hpcai-tech/grok-1`. | ✅︎ | ✅︎ | ✅︎ |
| `HunYuanDenseV1ForCausalLM` | Hunyuan-7B-Instruct-0124 | `tencent/Hunyuan-7B-Instruct-0124` | ✅︎ | | ✅︎ |
| `HunYuanMoEV1ForCausalLM` | Hunyuan-80B-A13B | `tencent/Hunyuan-A13B-Instruct`, `tencent/Hunyuan-A13B-Pretrain`, `tencent/Hunyuan-A13B-Instruct-FP8`, etc. | ✅︎ | | ✅︎ |
| `HCXVisionForCausalLM` | HyperCLOVAX-SEED-Vision-Instruct-3B | `naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B` | | | ✅︎ |
| `InternLMForCausalLM` | InternLM | `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `InternLM2ForCausalLM` | InternLM2 | `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `InternLM3ForCausalLM` | InternLM3 | `internlm/internlm3-8b-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
@ -592,6 +593,7 @@ Specified using `--task generate`.
| `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | ✅︎ |
| `H2OVLChatModel` | H2OVL | T + I<sup>E+</sup> | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎ |
| `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | | ✅︎ |
| `InternS1ForConditionalGeneration` | Intern-S1 | T + I<sup>E+</sup> + V<sup>E+</sup> | `internlm/Intern-S1`, etc. | | ✅︎ | ✅︎ |
| `InternVLChatModel` | InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I<sup>E+</sup> + (V<sup>E+</sup>) | `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-8B-Preview` | | | ✅︎ |
| `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I<sup>+</sup> | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | | ✅︎ |
@ -612,6 +614,7 @@ Specified using `--task generate`.
| `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + I<sup>E</sup> | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. | | ✅︎ | ⚠️ |
| `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + I<sup>E+</sup> | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | | ✅︎ | ✅︎ |
| `Phi4MMForCausalLM` | Phi-4-multimodal | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Phi4MultimodalForCausalLM` | Phi-4-multimodal (HF Transformers) | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct` (with revision `refs/pr/70`), etc. | ✅︎ | ✅︎ | ✅︎ |
| `PixtralForConditionalGeneration` | Mistral 3 (Mistral format), Pixtral (Mistral format) | T + I<sup>+</sup> | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistralai/Pixtral-12B-2409`, etc. | | ✅︎ | ✅︎ |
| `QwenVLForConditionalGeneration`<sup>^</sup> | Qwen-VL | T + I<sup>E+</sup> | `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Qwen2AudioForConditionalGeneration` | Qwen2-Audio | T + A<sup>+</sup> | `Qwen/Qwen2-Audio-7B-Instruct` | | ✅︎ | ✅︎ |

View File

@ -2,10 +2,14 @@
Reinforcement Learning from Human Feedback (RLHF) is a technique that fine-tunes language models using human-generated preference data to align model outputs with desired behaviors.
vLLM can be used to generate the completions for RLHF. The best way to do this is with libraries like [TRL](https://github.com/huggingface/trl), [OpenRLHF](https://github.com/OpenRLHF/OpenRLHF) and [verl](https://github.com/volcengine/verl).
vLLM can be used to generate the completions for RLHF. Some ways to do this include using libraries like [TRL](https://github.com/huggingface/trl), [OpenRLHF](https://github.com/OpenRLHF/OpenRLHF), [verl](https://github.com/volcengine/verl) and [unsloth](https://github.com/unslothai/unsloth).
See the following basic examples to get started if you don't want to use an existing library:
- [Training and inference processes are located on separate GPUs (inspired by OpenRLHF)](../examples/offline_inference/rlhf.md)
- [Training and inference processes are colocated on the same GPUs using Ray](../examples/offline_inference/rlhf_colocate.md)
- [Utilities for performing RLHF with vLLM](../examples/offline_inference/rlhf_utils.md)
See the following notebooks showing how to use vLLM for GRPO:
- [Qwen-3 4B GRPO using Unsloth + vLLM](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_(4B)-GRPO.ipynb)

View File

@ -190,6 +190,37 @@ def run_phi4mm(question: str, audio_count: int) -> ModelRequestData:
)
def run_phi4_multimodal(question: str, audio_count: int) -> ModelRequestData:
"""
Phi-4-multimodal-instruct supports both image and audio inputs. Here, we
show how to process audio inputs.
"""
model_path = snapshot_download(
"microsoft/Phi-4-multimodal-instruct", revision="refs/pr/70"
)
# Since the vision-lora and speech-lora co-exist with the base model,
# we have to manually specify the path of the lora weights.
speech_lora_path = os.path.join(model_path, "speech-lora")
placeholders = "<|audio|>" * audio_count
prompts = f"<|user|>{placeholders}{question}<|end|><|assistant|>"
engine_args = EngineArgs(
model=model_path,
max_model_len=12800,
max_num_seqs=2,
enable_lora=True,
max_lora_rank=320,
limit_mm_per_prompt={"audio": audio_count},
)
return ModelRequestData(
engine_args=engine_args,
prompt=prompts,
lora_requests=[LoRARequest("speech", 1, speech_lora_path)],
)
# Qwen2-Audio
def run_qwen2_audio(question: str, audio_count: int) -> ModelRequestData:
model_name = "Qwen/Qwen2-Audio-7B-Instruct"
@ -303,6 +334,7 @@ model_example_map = {
"granite_speech": run_granite_speech,
"minicpmo": run_minicpmo,
"phi4_mm": run_phi4mm,
"phi4_multimodal": run_phi4_multimodal,
"qwen2_audio": run_qwen2_audio,
"qwen2_5_omni": run_qwen2_5_omni,
"ultravox": run_ultravox,

View File

@ -3,12 +3,12 @@
import argparse
import datetime
import os
import re
from typing import Union
import albumentations
import numpy as np
import rasterio
import regex as re
import torch
from einops import rearrange
from terratorch.datamodules import Sen1Floods11NonGeoDataModule

View File

@ -29,6 +29,7 @@ import shutil
from pathlib import Path
from vllm import LLM, EngineArgs
from vllm.model_executor.model_loader import ShardedStateLoader
from vllm.utils import FlexibleArgumentParser
@ -39,7 +40,10 @@ def parse_args():
"--output", "-o", required=True, type=str, help="path to output checkpoint"
)
parser.add_argument(
"--file-pattern", type=str, help="string pattern of saved filenames"
"--file-pattern",
type=str,
default=ShardedStateLoader.DEFAULT_PATTERN,
help="string pattern of saved filenames",
)
parser.add_argument(
"--max-file-size",

View File

@ -316,6 +316,85 @@ def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
)
# naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B
def run_hyperclovax_seed_vision(
questions: list[str], modality: str
) -> ModelRequestData:
model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
engine_args = EngineArgs(
model=model_name,
trust_remote_code=True,
max_model_len=8192 if modality == "image" else 16384,
limit_mm_per_prompt={modality: 1},
)
messages = list()
for question in questions:
if modality == "image":
"""
ocr: List the words in the image in raster order.
Even if the word order feels unnatural for reading,
the model will handle it as long as it follows raster order.
e.g. "Naver, CLOVA, bigshane"
lens_keywords: List the entity names in the image.
e.g. "iPhone"
lens_local_keywords: List the entity names with quads in the image.
e.g. "[0.07, 0.21, 0.92, 0.90] iPhone"
"""
messages.append(
[
{
"role": "user",
"content": [
{
"type": "image",
"ocr": "",
"lens_keywords": "",
"lens_local_keywords": "",
},
{
"type": "text",
"text": question,
},
],
}
]
)
elif modality == "video":
messages.append(
[
{
"role": "user",
"content": [
{
"type": "video",
},
{
"type": "text",
"text": question,
},
],
}
]
)
else:
raise ValueError(f"Unsupported modality: {modality}")
prompts = tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True,
)
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
stop_token_ids=None,
)
# Idefics3-8B-Llama3
def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
@ -389,6 +468,39 @@ def run_tarsier(questions: list[str], modality: str) -> ModelRequestData:
)
# Intern-S1
def run_interns1(questions: list[str], modality: str) -> ModelRequestData:
model_name = "internlm/Intern-S1"
engine_args = EngineArgs(
model=model_name,
trust_remote_code=True,
max_model_len=8192,
max_num_seqs=2,
limit_mm_per_prompt={modality: 1},
enforce_eager=True,
)
if modality == "image":
placeholder = "<IMG_CONTEXT>"
elif modality == "video":
placeholder = "<video>"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
messages = [
[{"role": "user", "content": f"{placeholder}\n{question}"}]
for question in questions
]
prompts = tokenizer.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
# InternVL
def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
model_name = "OpenGVLab/InternVL3-2B"
@ -987,6 +1099,41 @@ def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData:
)
# HF format Phi-4-multimodal-instruct
def run_phi4_multimodal(questions: list[str], modality: str) -> ModelRequestData:
"""
Phi-4-multimodal-instruct supports both image and audio inputs. Here, we
show how to process image inputs.
"""
assert modality == "image"
model_path = snapshot_download(
"microsoft/Phi-4-multimodal-instruct", revision="refs/pr/70"
)
# Since the vision-lora and speech-lora co-exist with the base model,
# we have to manually specify the path of the lora weights.
vision_lora_path = os.path.join(model_path, "vision-lora")
prompts = [
f"<|user|><|image|>{question}<|end|><|assistant|>" for question in questions
]
engine_args = EngineArgs(
model=model_path,
max_model_len=5120,
max_num_seqs=2,
max_num_batched_tokens=12800,
enable_lora=True,
max_lora_rank=320,
# Note - mm_processor_kwargs can also be passed to generate/chat calls
mm_processor_kwargs={"dynamic_hd": 16},
limit_mm_per_prompt={"image": 1},
)
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
lora_requests=[LoRARequest("vision", 1, vision_lora_path)],
)
# Pixtral HF-format
def run_pixtral_hf(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
@ -1222,7 +1369,9 @@ model_example_map = {
"glm4v": run_glm4v,
"glm4_1v": run_glm4_1v,
"h2ovl_chat": run_h2ovl,
"hyperclovax_seed_vision": run_hyperclovax_seed_vision,
"idefics3": run_idefics3,
"interns1": run_interns1,
"internvl_chat": run_internvl,
"nemotron_vl": run_nemotron_vl,
"keye_vl": run_keye_vl,
@ -1244,6 +1393,7 @@ model_example_map = {
"paligemma2": run_paligemma2,
"phi3_v": run_phi3v,
"phi4_mm": run_phi4mm,
"phi4_multimodal": run_phi4_multimodal,
"pixtral_hf": run_pixtral_hf,
"qwen_vl": run_qwen_vl,
"qwen2_vl": run_qwen2_vl,

View File

@ -253,6 +253,33 @@ def load_smolvlm(question: str, image_urls: list[str]) -> ModelRequestData:
)
def load_interns1(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "internlm/Intern-S1"
engine_args = EngineArgs(
model=model_name,
trust_remote_code=True,
max_model_len=4096,
limit_mm_per_prompt={"image": len(image_urls)},
)
placeholders = "\n".join(
f"Image-{i}: <IMG_CONTEXT>\n" for i, _ in enumerate(image_urls, start=1)
)
messages = [{"role": "user", "content": f"{placeholders}\n{question}"}]
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
prompt = tokenizer.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
image_data=[fetch_image(url) for url in image_urls],
)
def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "OpenGVLab/InternVL2-2B"
@ -289,6 +316,53 @@ def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
)
def load_hyperclovax_seed_vision(
question: str, image_urls: list[str]
) -> ModelRequestData:
model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
engine_args = EngineArgs(
model=model_name,
trust_remote_code=True,
max_model_len=16384,
limit_mm_per_prompt={"image": len(image_urls)},
)
message = {"role": "user", "content": list()}
for _image_url in image_urls:
message["content"].append(
{
"type": "image",
"image": _image_url,
"ocr": "",
"lens_keywords": "",
"lens_local_keywords": "",
}
)
message["content"].append(
{
"type": "text",
"text": question,
}
)
prompt = tokenizer.apply_chat_template(
[
message,
],
tokenize=False,
add_generation_prompt=True,
)
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
stop_token_ids=None,
image_data=[fetch_image(url) for url in image_urls],
)
def load_llava(question: str, image_urls: list[str]) -> ModelRequestData:
# NOTE: CAUTION! Original Llava models weren't really trained on multi-image inputs;
# they will generate poor responses for multi-image inputs!
@ -686,6 +760,40 @@ def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData:
)
def load_phi4_multimodal(question: str, image_urls: list[str]) -> ModelRequestData:
"""
Phi-4-multimodal-instruct supports both image and audio inputs. Here, we
show how to process multi-image inputs.
"""
model_path = snapshot_download(
"microsoft/Phi-4-multimodal-instruct", revision="refs/pr/70"
)
# Since the vision-lora and speech-lora co-exist with the base model,
# we have to manually specify the path of the lora weights.
vision_lora_path = os.path.join(model_path, "vision-lora")
engine_args = EngineArgs(
model=model_path,
max_model_len=4096,
max_num_seqs=2,
limit_mm_per_prompt={"image": len(image_urls)},
enable_lora=True,
max_lora_rank=320,
# Note - mm_processor_kwargs can also be passed to generate/chat calls
mm_processor_kwargs={"dynamic_hd": 4},
)
placeholders = "<|image|>" * len(image_urls)
prompt = f"<|user|>{placeholders}{question}<|end|><|assistant|>"
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
image_data=[fetch_image(url) for url in image_urls],
lora_requests=[LoRARequest("vision", 1, vision_lora_path)],
)
def load_qwen_vl_chat(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "Qwen/Qwen-VL-Chat"
engine_args = EngineArgs(
@ -899,7 +1007,9 @@ model_example_map = {
"gemma3": load_gemma3,
"h2ovl_chat": load_h2ovl,
"idefics3": load_idefics3,
"interns1": load_interns1,
"internvl_chat": load_internvl,
"hyperclovax_seed_vision": load_hyperclovax_seed_vision,
"keye_vl": load_keye_vl,
"kimi_vl": load_kimi_vl,
"llava": load_llava,
@ -912,6 +1022,7 @@ model_example_map = {
"ovis": load_ovis,
"phi3_v": load_phi3v,
"phi4_mm": load_phi4mm,
"phi4_multimodal": load_phi4_multimodal,
"pixtral_hf": load_pixtral_hf,
"qwen_vl_chat": load_qwen_vl_chat,
"qwen2_vl": load_qwen2_vl,

View File

@ -29,7 +29,7 @@ PROXY_PORT=${PROXY_PORT:-30001}
PREFILL_GPUS=${PREFILL_GPUS:-0}
DECODE_GPUS=${DECODE_GPUS:-1,2,3}
PREFILL_PORTS=${PREFILL_PORTS:-20003}
DECODE_PORTS=${DECODE_PORTS:-20005,20007,20009}
echo "Warning: P2P NCCL disaggregated prefill XpYd support for vLLM v1 is experimental and subject to change."
echo ""
@ -164,7 +164,7 @@ main() {
local gpu_id=${PREFILL_GPU_ARRAY[$i]}
local port=${PREFILL_PORT_ARRAY[$i]}
local kv_port=$((21001 + i))
echo " Prefill server $((i+1)): GPU $gpu_id, Port $port, KV Port $kv_port"
CUDA_VISIBLE_DEVICES=$gpu_id VLLM_USE_V1=1 vllm serve $MODEL \
--enforce-eager \
@ -193,7 +193,7 @@ main() {
local gpu_id=${DECODE_GPU_ARRAY[$i]}
local port=${DECODE_PORT_ARRAY[$i]}
local kv_port=$((22001 + i))
echo " Decode server $((i+1)): GPU $gpu_id, Port $port, KV Port $kv_port"
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=$gpu_id vllm serve $MODEL \
--enforce-eager \
@ -233,7 +233,7 @@ main() {
# Run Benchmark
# =============================================================================
cd ../../../benchmarks/
python3 benchmark_serving.py --port 10001 --seed $(date +%s) \
vllm bench serve --port 10001 --seed $(date +%s) \
--model $MODEL \
--dataset-name random --random-input-len 7500 --random-output-len 200 \
--num-prompts 200 --burstiness 100 --request-rate 2 | tee benchmark.log
@ -243,4 +243,4 @@ main() {
cleanup
}
main

View File

@ -28,7 +28,7 @@ Submit some sample requests to the server:
```bash
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
python3 ../../../benchmarks/benchmark_serving.py \
vllm bench serve \
--model mistralai/Mistral-7B-v0.1 \
--tokenizer mistralai/Mistral-7B-v0.1 \
--endpoint /v1/completions \

View File

@ -122,7 +122,7 @@ main() {
# begin benchmark
cd ../../../../benchmarks/
python3 benchmark_serving.py --port 9000 --seed $(date +%s) \
vllm bench serve --port 9000 --seed $(date +%s) \
--model meta-llama/Llama-3.1-8B-Instruct \
--dataset-name random --random-input-len 7500 --random-output-len 200 \
--num-prompts 200 --burstiness 100 --request-rate 3.6 | tee benchmark.log
@ -133,4 +133,4 @@ main() {
}
main

View File

@ -10,7 +10,8 @@ setuptools>=77.0.3,<80.0.0
--extra-index-url https://download.pytorch.org/whl/cpu
torch==2.6.0+cpu; platform_machine == "x86_64" # torch>2.6.0+cpu has performance regression on x86 platform, see https://github.com/pytorch/pytorch/pull/151218
torch==2.7.0; platform_system == "Darwin"
torch==2.7.0; platform_machine == "ppc64le" or platform_machine == "aarch64"
torch==2.7.0; platform_machine == "ppc64le"
torch==2.6.0; platform_machine == "aarch64"  # for arm64 CPUs, torch 2.7.0 has an issue: https://github.com/vllm-project/vllm/issues/17960
# required for the image processor of minicpm-o-2_6, this must be updated alongside torch
torchaudio; platform_machine != "ppc64le" and platform_machine != "s390x"
@ -25,3 +26,6 @@ datasets # for benchmark scripts
intel-openmp==2024.2.1; platform_machine == "x86_64"
intel_extension_for_pytorch==2.6.0; platform_machine == "x86_64" # torch>2.6.0+cpu has performance regression on x86 platform, see https://github.com/pytorch/pytorch/pull/151218
triton==3.2.0; platform_machine == "x86_64" # Triton is required for torch 2.6+cpu, as it is imported in torch.compile.
# Use this to gather CPU info and optimize based on ARM Neoverse cores
py-cpuinfo; platform_machine == "aarch64"

View File

@ -19,8 +19,8 @@ nixl==0.3.0
--find-links https://storage.googleapis.com/libtpu-releases/index.html
--find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html
--find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
torch==2.9.0.dev20250716
torchvision==0.24.0.dev20250716
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.9.0.dev20250716-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.9.0.dev20250716-cp312-cp312-linux_x86_64.whl ; python_version == "3.12"
torch==2.9.0.dev20250724
torchvision==0.24.0.dev20250724
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.9.0.dev20250724-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.9.0.dev20250724-cp312-cp312-linux_x86_64.whl ; python_version == "3.12"

View File

@ -1062,8 +1062,17 @@ class VllmRunner:
return [req_output.outputs.score for req_output in req_outputs]
def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]:
executor = self.llm.llm_engine.model_executor
return executor.apply_model(func)
if hasattr(self.llm.llm_engine, "model_executor"):
# This works either in V0 or in V1 with
# VLLM_ENABLE_V1_MULTIPROCESSING=0
executor = self.llm.llm_engine.model_executor
return executor.apply_model(func)
# This works in V1 with VLLM_ALLOW_INSECURE_SERIALIZATION=1
def _apply_model(self):
return func(self.get_model())
return self.llm.llm_engine.collective_rpc(_apply_model)
def __enter__(self):
return self

View File

@ -0,0 +1,93 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import base64
import io
import numpy as np
import pytest
import requests
import torch
from ...utils import RemoteOpenAIServer
MODEL_NAME = "christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM"
DTYPE = "float16"
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
@pytest.fixture(scope="module")
def server():
args = [
"--task",
"embed",
# use half precision for speed and memory savings in CI environment
"--dtype",
DTYPE,
"--enforce-eager",
"--trust-remote-code",
"--skip-tokenizer-init",
"--max-num-seqs",
"32"
]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_request(server: RemoteOpenAIServer, model_name: str):
pixel_values = torch.full((6, 512, 512), 1.0, dtype=torch.float16)
location_coords = torch.full((1, 2), 1.0, dtype=torch.float16)
buffer_tiff = io.BytesIO()
torch.save(pixel_values, buffer_tiff)
buffer_tiff.seek(0)
binary_data = buffer_tiff.read()
base64_tensor_embedding = base64.b64encode(binary_data).decode('utf-8')
buffer_coord = io.BytesIO()
torch.save(location_coords, buffer_coord)
buffer_coord.seek(0)
binary_data = buffer_coord.read()
base64_coord_embedding = base64.b64encode(binary_data).decode('utf-8')
prompt = {
"model":
model_name,
"additional_data": {
"prompt_token_ids": [1]
},
"encoding_format":
"base64",
"messages": [{
"role":
"user",
"content": [{
"type": "image_embeds",
"image_embeds": {
"pixel_values": base64_tensor_embedding,
"location_coords": base64_coord_embedding,
},
}],
}]
}
# test single pooling
response = requests.post(server.url_for("pooling"), json=prompt)
response.raise_for_status()
output = response.json()["data"][0]['data']
np_response = np.frombuffer(base64.b64decode(output), dtype=np.float32)
assert len(np_response) == 524288

View File

@ -0,0 +1,191 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Optional
import pytest
import torch
import vllm.v1.attention.backends.rocm_aiter_fa # noqa: F401
from vllm.platforms import current_platform
NUM_HEADS = [(4, 4), (8, 2), (16, 2)]
HEAD_SIZES = [128, 256]
BLOCK_SIZES = [16, 32]
DTYPES = [torch.float16, torch.bfloat16]
QDTYPES = [None]
# one value large enough to test overflow in index calculation.
# one value small enough to test the schema op check
NUM_BLOCKS = [32768, 2048]
def ref_paged_attn(
query: torch.Tensor,
key_cache: torch.Tensor,
value_cache: torch.Tensor,
query_lens: list[int],
kv_lens: list[int],
block_tables: torch.Tensor,
scale: float,
sliding_window: Optional[int] = None,
soft_cap: Optional[float] = None,
) -> torch.Tensor:
num_seqs = len(query_lens)
block_tables = block_tables.cpu().numpy()
_, block_size, num_kv_heads, head_size = key_cache.shape
outputs: list[torch.Tensor] = []
start_idx = 0
for i in range(num_seqs):
query_len = query_lens[i]
kv_len = kv_lens[i]
q = query[start_idx:start_idx + query_len]
q *= scale
num_kv_blocks = (kv_len + block_size - 1) // block_size
block_indices = block_tables[i, :num_kv_blocks]
k = key_cache[block_indices].view(-1, num_kv_heads, head_size)
k = k[:kv_len]
v = value_cache[block_indices].view(-1, num_kv_heads, head_size)
v = v[:kv_len]
if q.shape[1] != k.shape[1]:
k = torch.repeat_interleave(k, q.shape[1] // k.shape[1], dim=1)
v = torch.repeat_interleave(v, q.shape[1] // v.shape[1], dim=1)
attn = torch.einsum("qhd,khd->hqk", q, k).float()
empty_mask = torch.ones(query_len, kv_len)
mask = torch.triu(empty_mask, diagonal=kv_len - query_len + 1).bool()
if sliding_window is not None:
sliding_window_mask = torch.triu(empty_mask,
diagonal=kv_len -
(query_len + sliding_window) +
1).bool().logical_not()
mask |= sliding_window_mask
if soft_cap is not None:
attn = soft_cap * torch.tanh(attn / soft_cap)
attn.masked_fill_(mask, float("-inf"))
attn = torch.softmax(attn, dim=-1).to(v.dtype)
out = torch.einsum("hqk,khd->qhd", attn, v)
outputs.append(out)
start_idx += query_len
return torch.cat(outputs, dim=0)
@pytest.mark.skipif(not current_platform.is_rocm(),
reason="Only ROCm is supported")
@pytest.mark.parametrize("seq_lens",
[[(10, 1328), (5, 18),
(129, 463)], [(8, 523), (24, 37), (3, 2011)]])
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
@pytest.mark.parametrize("sliding_window", [None, 256])
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("soft_cap", [None])
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
@pytest.mark.parametrize("q_dtype", QDTYPES)
@torch.inference_mode()
def test_varlen_with_paged_kv(
seq_lens: list[tuple[int, int]],
num_heads: tuple[int, int],
head_size: int,
sliding_window: Optional[int],
dtype: torch.dtype,
block_size: int,
soft_cap: Optional[float],
num_blocks: int,
q_dtype: Optional[torch.dtype],
) -> None:
torch.set_default_device("cuda")
current_platform.seed_everything(0)
num_seqs = len(seq_lens)
query_lens = [x[0] for x in seq_lens]
kv_lens = [x[1] for x in seq_lens]
num_query_heads = num_heads[0]
num_kv_heads = num_heads[1]
assert num_query_heads % num_kv_heads == 0
max_query_len = max(query_lens)
max_kv_len = max(kv_lens)
window_size = ((sliding_window - 1, 0) if sliding_window is not None else
(-1, -1))
scale = head_size**-0.5
query = torch.randn(sum(query_lens),
num_query_heads,
head_size,
dtype=dtype)
key_cache = torch.randn(num_blocks,
block_size,
num_kv_heads,
head_size,
dtype=dtype)
value_cache = torch.randn_like(key_cache)
cu_query_lens = torch.tensor([0] + query_lens,
dtype=torch.int32).cumsum(dim=0,
dtype=torch.int32)
cu_seq_lens = torch.tensor([0] + kv_lens,
dtype=torch.int32).cumsum(dim=0,
dtype=torch.int32)
kv_lens = torch.tensor(kv_lens, dtype=torch.int32)
max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size
block_tables = torch.randint(0,
num_blocks,
(num_seqs, max_num_blocks_per_seq),
dtype=torch.int32)
output = torch.empty_like(query)
maybe_quantized_query = query
maybe_quantized_key_cache = key_cache
maybe_quantized_value_cache = value_cache
k_descale = None
v_descale = None
if q_dtype is not None:
# QKV are drawn from N(0, 1): no need for a fp8 scaling factor
maybe_quantized_query = query.to(q_dtype)
maybe_quantized_key_cache = key_cache.to(q_dtype)
maybe_quantized_value_cache = value_cache.to(q_dtype)
scale_shape = (num_seqs, num_kv_heads)
k_descale = torch.ones(scale_shape, dtype=torch.float32)
v_descale = torch.ones(scale_shape, dtype=torch.float32)
torch.ops.vllm.flash_attn_varlen_func(
maybe_quantized_query,
maybe_quantized_key_cache,
maybe_quantized_value_cache,
out=output,
cu_seqlens_q=cu_query_lens,
max_seqlen_q=max_query_len,
max_seqlen_k=max_kv_len,
softmax_scale=scale,
alibi_slopes=None,
window_size=window_size,
block_table=block_tables,
cu_seqlens_k=cu_seq_lens,
k_scale=k_descale,
v_scale=v_descale,
)
ref_output = ref_paged_attn(
query=query,
key_cache=key_cache,
value_cache=value_cache,
query_lens=query_lens,
kv_lens=kv_lens,
block_tables=block_tables,
scale=scale,
sliding_window=sliding_window,
soft_cap=soft_cap,
)
atol, rtol = 2e-2, 2e-2
if q_dtype is not None:
atol, rtol = 1.5e-1, 1.5e-1
torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol), \
f"{torch.max(torch.abs(output - ref_output))}"

View File

@ -17,28 +17,34 @@ from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import (
moe_permute, moe_permute_unpermute_supported, moe_unpermute)
from vllm.platforms import current_platform
NUM_EXPERTS = [16, 64]
NUM_EXPERTS = [16, 64, 256]
TOP_KS = [2, 4, 6, 8]
EP_SIZE = [1, 4, 16]
current_platform.seed_everything(0)
def torch_permute(hidden_states: torch.Tensor,
topk_ids: torch.Tensor,
token_expert_indices: torch.Tensor,
topk: int,
n_expert: int,
n_local_expert: int,
start_expert: int,
expert_map: Optional[torch.Tensor] = None,
align_block_size: Optional[int] = None,
fill_invalid_expert: int = -1) -> list[torch.Tensor]:
def torch_permute(
hidden_states: torch.Tensor,
topk_ids: torch.Tensor,
# token_expert_indices: torch.Tensor,
topk: int,
n_expert: int,
n_local_expert: int,
start_expert: int,
expert_map: Optional[torch.Tensor] = None,
align_block_size: Optional[int] = None,
fill_invalid_expert: int = -1) -> list[torch.Tensor]:
n_token, n_hidden = hidden_states.shape[0], hidden_states.shape[1]
if expert_map is not None:
is_local_expert = (expert_map[topk_ids] != -1)
not_local_expert = (expert_map[topk_ids] == -1)
topk_ids = is_local_expert * (
topk_ids - start_expert) + not_local_expert * (topk_ids + n_expert)
token_expert_indices = torch.arange(0,
n_token * topk,
dtype=torch.int32,
device=hidden_states.device).reshape(
(n_token, topk))
sorted_topk_ids, sorted_indices = torch.sort(topk_ids.flatten(),
stable=True)
@ -59,8 +65,8 @@ def torch_permute(hidden_states: torch.Tensor,
valid_row_idx = []
if align_block_size is None:
permuted_hidden_states = hidden_states[dst_row_id2src_row_id_map %
n_token, ...]
permuted_hidden_states = hidden_states[dst_row_id2src_row_id_map //
topk, ...]
permuted_row_size = permuted_hidden_states.shape[0]
m_indices = torch.empty(permuted_row_size,
device="cuda",
@ -73,14 +79,21 @@ def torch_permute(hidden_states: torch.Tensor,
0, n_token * topk, device="cuda",
dtype=torch.int32)[src2dst_idx].reshape((n_token, topk))
valid_row_idx += [i for i in range(expert_first_token_offset[-1])]
dst_row_id2src_row_id_map[
expert_first_token_offset[-1]:] = n_token * topk
return [
permuted_hidden_states, expert_first_token_offset,
src_row_id2dst_row_id_map, m_indices, valid_row_idx
src_row_id2dst_row_id_map, dst_row_id2src_row_id_map, m_indices,
valid_row_idx
]
else:
permuted_row_size = (topk * n_token + n_expert *
(align_block_size - 1) + align_block_size -
1) // align_block_size * align_block_size
permuted_idx = torch.full((permuted_row_size, ),
n_token * topk,
dtype=torch.int32,
device=hidden_states.device)
permuted_hidden_states = torch.empty((permuted_row_size, n_hidden),
device="cuda",
dtype=hidden_states.dtype)
@ -105,13 +118,16 @@ def torch_permute(hidden_states: torch.Tensor,
align_first_token_offset = align_expert_first_token_offset[i - 1]
align_last_token_offset = align_expert_first_token_offset[i]
dst_row_id2src_row_id_in_expert = dst_row_id2src_row_id_map[
first_token_offset:first_token_offset +
n_token_in_expert] % n_token
first_token_offset:first_token_offset + n_token_in_expert]
# store token in current expert with align_first_token_offset
permuted_hidden_states[align_first_token_offset:\
align_first_token_offset+n_token_in_expert,\
...] = hidden_states[\
dst_row_id2src_row_id_in_expert, ...]
dst_row_id2src_row_id_in_expert // topk,\
...]
permuted_idx[align_first_token_offset:\
align_first_token_offset+\
n_token_in_expert] = dst_row_id2src_row_id_in_expert
# set current expert m_indices
m_indices[align_first_token_offset:align_last_token_offset] = i - 1
valid_row_idx += [
@ -135,7 +151,7 @@ def torch_permute(hidden_states: torch.Tensor,
src2dst_idx].reshape((n_token, topk))
return [
permuted_hidden_states, align_expert_first_token_offset,
align_src_row_id2dst_row_id, m_indices, valid_row_idx
align_src_row_id2dst_row_id, permuted_idx, m_indices, valid_row_idx
]
@ -146,15 +162,18 @@ def torch_unpermute(permuted_hidden_states: torch.Tensor,
valid_row_idx: torch.Tensor, topk: int,
n_expert: int) -> torch.Tensor:
# ignore invalid row
n_hidden = permuted_hidden_states.shape[1]
mask = torch.zeros(permuted_hidden_states.shape[0],
dtype=bool,
device="cuda")
mask[valid_row_idx] = True
permuted_hidden_states[~mask] = 0
idx = src_row_id2dst_row_id_map.flatten()[
token_expert_indices.flatten()].reshape(token_expert_indices.shape)
output = permuted_hidden_states[idx, ...] * topk_weights[..., None]
output = output.sum(dim=1).to(permuted_hidden_states.dtype)
permuted_hidden_states = permuted_hidden_states[
src_row_id2dst_row_id_map.flatten(), ...]
permuted_hidden_states = permuted_hidden_states.view(-1, topk, n_hidden)
output = (permuted_hidden_states * topk_weights.unsqueeze(2)).sum(1).to(
permuted_hidden_states.dtype)
return output
@ -184,43 +203,56 @@ def test_moe_permute_unpermute(n_token: int, n_hidden: int, topk: int,
gating_output = torch.randn((n_token, n_expert), device="cuda").to(dtype)
topk_weights, topk_ids, token_expert_indices = fused_topk(
hidden_states, gating_output, topk, False)
gold0, gold1, gold2, gold3, valid_row_idx = torch_permute(
hidden_states,
topk_ids,
token_expert_indices,
topk,
n_expert,
n_local_expert,
start_expert,
expert_map=expert_map,
align_block_size=align_block_size,
fill_invalid_expert=fill_invalid_expert)
(gold_permuted_hidden_states, gold_expert_first_token_offset,
gold_inv_permuted_idx, gold_permuted_idx, gold_m_indices,
valid_row_idx) = torch_permute(
hidden_states,
topk_ids,
# token_expert_indices,
topk,
n_expert,
n_local_expert,
start_expert,
expert_map=expert_map,
align_block_size=align_block_size,
fill_invalid_expert=fill_invalid_expert)
result0, result1, result2, result3 = moe_permute(
hidden_states, topk_weights, topk_ids, token_expert_indices, topk,
n_expert, n_local_expert, expert_map, align_block_size,
fill_invalid_expert)
(permuted_hidden_states, _, expert_first_token_offset, inv_permuted_idx,
m_indices) = moe_permute(hidden_states=hidden_states,
a1q_scale=None,
topk_ids=topk_ids,
n_expert=n_expert,
n_local_expert=n_local_expert,
expert_map=expert_map,
align_block_size=align_block_size,
fill_invalid_expert=fill_invalid_expert)
# check expert_first_token_offset
torch.testing.assert_close(gold1, result1, atol=0, rtol=0)
# check src_row_id2dst_row_id_map
torch.testing.assert_close(gold2, result2, atol=0, rtol=0)
# check mindice
torch.testing.assert_close(gold3, result3, atol=0, rtol=0)
# check permuted_hidden_states, only valid token
torch.testing.assert_close(gold0[valid_row_idx],
result0[valid_row_idx],
torch.testing.assert_close(gold_expert_first_token_offset,
expert_first_token_offset,
atol=0,
rtol=0)
# check src_row_id2dst_row_id_map
torch.testing.assert_close(gold_inv_permuted_idx.flatten(),
inv_permuted_idx,
atol=0,
rtol=0)
# check mindice
torch.testing.assert_close(gold_m_indices, m_indices, atol=0, rtol=0)
# check permuted_hidden_states, only valid token
torch.testing.assert_close(gold_permuted_hidden_states[valid_row_idx],
permuted_hidden_states[valid_row_idx],
atol=0,
rtol=0)
# add a random tensor to simulate group gemm
result0 = 0.5 * result0 + torch.randn_like(result0)
result0 = 0.5 * permuted_hidden_states + torch.randn_like(
permuted_hidden_states)
result4 = torch.empty_like(hidden_states)
moe_unpermute(result4, result0, topk_weights, inv_permuted_idx,
expert_first_token_offset)
result4 = moe_unpermute(result0, topk_weights, topk_ids, result2, result1,
topk, n_expert, n_local_expert)
gold4 = torch_unpermute(result0, topk_weights, topk_ids,
token_expert_indices, result2, valid_row_idx, topk,
n_local_expert)
token_expert_indices, inv_permuted_idx,
valid_row_idx, topk, n_local_expert)
# check unpermuted hidden
torch.testing.assert_close(result4, gold4, atol=2e-2, rtol=0)

View File

@ -22,10 +22,12 @@ REVISION_ROBERTA = os.environ.get("REVISION", "main")
@pytest.mark.skipif(current_platform.is_rocm(),
reason="Xformers backend is not supported on ROCm.")
def test_model_loading_with_params(vllm_runner):
def test_model_loading_with_params(vllm_runner, monkeypatch):
"""
Test parameter weight loading with tp>1.
"""
# to use apply_model
monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
with vllm_runner(model_name=MODEL_NAME,
revision=REVISION,
dtype="float16",
@ -61,10 +63,12 @@ def test_model_loading_with_params(vllm_runner):
@pytest.mark.skipif(current_platform.is_rocm(),
reason="Xformers backend is not supported on ROCm.")
def test_roberta_model_loading_with_params(vllm_runner):
def test_roberta_model_loading_with_params(vllm_runner, monkeypatch):
"""
Test parameter weight loading with tp>1.
"""
# to use apply_model
monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
with vllm_runner(model_name=MODEL_NAME_ROBERTA,
revision=REVISION_ROBERTA,
dtype="float16",
@ -101,10 +105,12 @@ def test_roberta_model_loading_with_params(vllm_runner):
@pytest.mark.skipif(current_platform.is_rocm(),
reason="Xformers backend is not supported on ROCm.")
def test_facebook_roberta_model_loading_with_params(vllm_runner):
def test_facebook_roberta_model_loading_with_params(vllm_runner, monkeypatch):
"""
Test loading roberta-base model with no lm_head.
"""
# to use apply_model
monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
model_name = "FacebookAI/roberta-base"
with vllm_runner(model_name=model_name,
dtype="float16",

View File

@ -39,17 +39,9 @@ def v1(run_with_both_engines):
pytest.param("ssmits/Qwen2-7B-Instruct-embed-base",
marks=[pytest.mark.skip_v0, pytest.mark.cpu_model]),
# [Encoder-only]
pytest.param(
"BAAI/bge-base-en-v1.5",
marks=[
# CPU only supports V1
pytest.mark.core_model,
pytest.mark.skip_v1
]),
pytest.param("sentence-transformers/all-MiniLM-L12-v2",
marks=[pytest.mark.skip_v1]),
pytest.param("intfloat/multilingual-e5-small",
marks=[pytest.mark.skip_v1]),
pytest.param("BAAI/bge-base-en-v1.5", marks=[pytest.mark.core_model]),
pytest.param("sentence-transformers/all-MiniLM-L12-v2"),
pytest.param("intfloat/multilingual-e5-small"),
pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct",
marks=[pytest.mark.skip_v1]),
# [Cross-Encoder]

View File

@ -23,6 +23,14 @@ RERANK_MODELS = [
]
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
@pytest.mark.parametrize("model_info", EMBEDDING_MODELS)
def test_embed_models_mteb(hf_runner, vllm_runner,
model_info: EmbedModelInfo) -> None:

View File

@ -677,6 +677,7 @@ VLM_TEST_SETTINGS = {
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
multi_image_prompt="Picture 1: <vlm_image>\nPicture 2: <vlm_image>\nDescribe these two images with one paragraph respectively.", # noqa: E501
max_model_len=4096,
max_num_seqs=2,
auto_cls=AutoModelForVision2Seq,

View File

@ -22,6 +22,9 @@ from transformers import (AutoConfig, AutoProcessor, AutoTokenizer,
GenerationConfig)
from vllm import LLM, SamplingParams
from vllm.v1.executor.abstract import Executor
from vllm.v1.kv_cache_interface import (ChunkedLocalAttentionSpec,
FullAttentionSpec)
from ....utils import multi_gpu_test
@ -69,6 +72,26 @@ def run_maverick_serving(model: str):
raise
def get_rope_layers_config(model_path: str) -> list[int]:
"""
Get the interleaved RoPE configuration from HuggingFace config
Args:
model_path: Path to the local directory containing the reduced
Maverick model checkpoint
Returns:
List of 0s and 1s indicating whether each layer uses RoPE and local
attention; 0 means the layer does not use RoPE, 1 means it does.
"""
config_path = Path(model_path) / "config.json"
model_config = json.loads(config_path.read_text())
text_config = model_config["text_config"]
no_rope_layers = text_config["no_rope_layers"]
print(f"Found no_rope_layers: {no_rope_layers}")
return no_rope_layers
def create_reduced_maverick_model(
original_model_name:
str = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
@ -113,7 +136,6 @@ def create_reduced_maverick_model(
print("Loading original model configuration...")
original_config = AutoConfig.from_pretrained(original_model_name,
trust_remote_code=True)
print("Creating reduced configuration...")
reduced_config = create_reduced_config(original_config, text_layers,
num_experts, vision_layers)
@ -510,21 +532,32 @@ def save_weights_to_safetensors(weights: dict[str, torch.Tensor],
f"{index_data['metadata']['total_size'] / (1024**3):.2f} GB")
def run_reduced_model(model_path: str,
should_profile: bool = False,
**kwargs) -> None:
"""Test the created reduced model with vLLM."""
print(f"\nTesting reduced model at {model_path}...")
llm = LLM(
model=model_path,
trust_remote_code=True,
max_model_len=512, # Small context for testing
gpu_memory_utilization=0.3, # Conservative memory usage
**kwargs,
def check_attention_spec_interleaved_rope(
llm: LLM,
num_attention_layers: int,
num_ranks: int,
rope_layers: list[int],
):
"""Check that the attention spec is correct."""
assert isinstance(llm.llm_engine.model_executor, Executor)
kv_cache_specs_per_rank = llm.llm_engine.model_executor.get_kv_cache_specs(
)
for rank in range(num_ranks):
kv_cache_specs = kv_cache_specs_per_rank[rank]
assert len(kv_cache_specs.keys()) == num_attention_layers
for i in range(num_attention_layers):
if rope_layers[i] == 0:
expected_spec = FullAttentionSpec
else:
expected_spec = ChunkedLocalAttentionSpec
assert isinstance(
kv_cache_specs[
f"language_model.model.layers.{i}.self_attn.attn"],
expected_spec)
def run_reduced_model(llm: LLM, should_profile: bool = False) -> None:
"""Test the created reduced model with vLLM."""
sampling_params = SamplingParams(temperature=0.8,
top_p=0.95,
max_tokens=50)
@ -551,6 +584,7 @@ def run_reduced_model(model_path: str,
@pytest.mark.parametrize("tp,ep", [(2, True)])
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
def test_dummy_maverick(
monkeypatch,
original_model_name: str,
text_layers: int,
num_experts: int,
@ -562,6 +596,10 @@ def test_dummy_maverick(
force_recreate: bool = True,
profile: bool = False,
) -> None:
# Disabling multiprocessing allows us to access the model executor from the LLM engine
monkeypatch.setenv("VLLM_USE_V1", "1")
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
model_path = create_reduced_maverick_model(
original_model_name=original_model_name,
output_dir=output_dir,
@ -573,11 +611,27 @@ def test_dummy_maverick(
print(f"\nReduced model created successfully at: {model_path}")
run_reduced_model(model_path=model_path,
should_profile=profile,
enforce_eager=enforce_eager,
tensor_parallel_size=tp,
enable_expert_parallel=ep)
rope_layers = get_rope_layers_config(model_path)
llm = LLM(
model=model_path,
trust_remote_code=True,
max_model_len=512, # Small context for testing
gpu_memory_utilization=0.3, # Conservative memory usage
enforce_eager=enforce_eager,
tensor_parallel_size=tp,
enable_expert_parallel=ep,
)
check_attention_spec_interleaved_rope(
llm,
text_layers,
tp,
rope_layers,
)
print(f"\nTesting reduced model at {model_path}...")
run_reduced_model(llm=llm, should_profile=profile)
def main():
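The check above keys off `no_rope_layers` from the checkpoint's `config.json`: a 0 entry is expected to map to `FullAttentionSpec`, anything else to `ChunkedLocalAttentionSpec`. A minimal sketch of that mapping, using a made-up four-layer `no_rope_layers` value rather than a real checkpoint:

```python
# Hypothetical no_rope_layers value, as get_rope_layers_config would read it
# from config.json["text_config"]["no_rope_layers"].
no_rope_layers = [0, 1, 1, 0]

# Mirror the mapping asserted in check_attention_spec_interleaved_rope:
# 0 -> FullAttentionSpec (global attention), non-zero -> ChunkedLocalAttentionSpec.
expected_specs = [
    "FullAttentionSpec" if flag == 0 else "ChunkedLocalAttentionSpec"
    for flag in no_rope_layers
]
print(expected_specs)
# ['FullAttentionSpec', 'ChunkedLocalAttentionSpec',
#  'ChunkedLocalAttentionSpec', 'FullAttentionSpec']
```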

View File

@ -0,0 +1,252 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
from collections.abc import Sequence
from typing import Optional
import librosa
import pytest
from huggingface_hub import snapshot_download
from vllm.assets.image import ImageAsset
from vllm.lora.request import LoRARequest
from vllm.multimodal.image import rescale_image_size
from vllm.platforms import current_platform
from ....conftest import (IMAGE_ASSETS, HfRunner, PromptAudioInput,
PromptImageInput, VllmRunner)
from ....utils import large_gpu_test
from ...utils import check_logprobs_close
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"stop_sign":
"<|user|>\n<|image|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n", # noqa: E501
"cherry_blossom":
"<|user|>\n<|image|>\nPlease infer the season with reason in details.<|end|>\n<|assistant|>\n", # noqa: E501
})
HF_MULTIIMAGE_IMAGE_PROMPT = "<|user|>\n<|image|>\n<|image|>\nDescribe these images.<|end|>\n<|assistant|>\n" # noqa: E501
model_path = snapshot_download("microsoft/Phi-4-multimodal-instruct",
revision="refs/pr/70")
# Since the vision-lora and speech-lora co-exist with the base model,
# we have to manually specify the path of the lora weights.
vision_lora_path = os.path.join(model_path, "vision-lora")
speech_question = os.path.join(model_path, "examples",
"what_is_shown_in_this_image.wav")
models = [model_path]
target_dtype = "half"
# ROCm Triton FA can run into shared memory issues with these models,
# use other backends in the meantime
# FIXME (mattwong, gshtrasb, hongxiayan)
if current_platform.is_rocm():
os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"
def run_test(
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
inputs: Sequence[tuple[list[str], PromptImageInput,
Optional[PromptAudioInput]]],
model: str,
*,
max_model_len: int,
dtype: str,
max_tokens: int,
num_logprobs: int,
mm_limit: int,
tensor_parallel_size: int,
distributed_executor_backend: Optional[str] = None,
):
"""Inference result should be the same between hf and vllm.
All the image fixtures for the test are from IMAGE_ASSETS.
For huggingface runner, we provide the PIL images as input.
For vllm runner, we provide MultiModalDataDict objects
and corresponding MultiModalConfig as input.
Note, the text input is also adjusted to abide by the vllm contract.
The text output is sanitized to be able to compare with hf.
"""
# NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh process without CUDA initialization.
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).
# max_model_len should be greater than image_feature_size
with vllm_runner(
model,
task="generate",
max_model_len=max_model_len,
max_num_seqs=2,
dtype=dtype,
limit_mm_per_prompt={"image": mm_limit},
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enable_lora=True,
max_lora_rank=320,
gpu_memory_utilization=0.8, # set to 0.8 to avoid OOM in CI
enforce_eager=True,
trust_remote_code=False,
) as vllm_model:
lora_request = LoRARequest("vision", 1, vision_lora_path)
vllm_outputs_per_case = [
vllm_model.generate_greedy_logprobs(prompts,
max_tokens,
num_logprobs=num_logprobs,
images=images,
audios=audios,
lora_request=lora_request)
for prompts, images, audios in inputs
]
with hf_runner(model, dtype=dtype) as hf_model:
hf_model.model.load_adapter(
vision_lora_path,
adapter_name="vision",
)
hf_processor = hf_model.processor
eos_token_id = hf_processor.tokenizer.eos_token_id
hf_outputs_per_case = [
hf_model.generate_greedy_logprobs_limit(prompts,
max_tokens,
num_logprobs=num_logprobs,
images=images,
audios=audios,
eos_token_id=eos_token_id)
for prompts, images, audios in inputs
]
for hf_outputs, vllm_outputs in zip(hf_outputs_per_case,
vllm_outputs_per_case):
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
"size_factors",
[
# No image
[],
# Single-scale
[1.0],
# Single-scale, batched
[1.0, 1.0, 1.0],
# Multi-scale
[0.25, 0.5, 1.0],
],
)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_model_len", [12800])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
dtype: str, max_model_len: int, max_tokens: int,
num_logprobs: int) -> None:
images = [asset.pil_image for asset in image_assets]
inputs_per_image = [(
[prompt for _ in size_factors],
[rescale_image_size(image, factor) for factor in size_factors],
None,
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
run_test(
hf_runner,
vllm_runner,
inputs_per_image,
model,
dtype=dtype,
max_model_len=max_model_len,
max_tokens=max_tokens,
num_logprobs=num_logprobs,
mm_limit=1,
tensor_parallel_size=1,
)
@large_gpu_test(min_gb=48)
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
"size_factors",
[
# No image
# [],
# Single-scale
[1.0],
# Single-scale, batched
[1.0, 1.0, 1.0],
# Multi-scale
[0.25, 0.5, 1.0],
],
)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_model_len", [25600])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
size_factors, dtype: str, max_model_len: int,
max_tokens: int, num_logprobs: int) -> None:
images = [asset.pil_image for asset in image_assets]
inputs_per_case = [
(
[HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors],
[[rescale_image_size(image, factor) for image in images]
for factor in size_factors],
None,
),
]
run_test(
hf_runner,
vllm_runner,
inputs_per_case,
model,
dtype=dtype,
max_model_len=max_model_len,
max_tokens=max_tokens,
num_logprobs=num_logprobs,
mm_limit=2,
tensor_parallel_size=1,
)
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_model_len", [12800])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_vision_speech_models(hf_runner, vllm_runner, model, dtype: str,
max_model_len: int, max_tokens: int,
num_logprobs: int) -> None:
# use the example speech question so that the model outputs are reasonable
audio = librosa.load(speech_question, sr=16000)
image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
inputs_vision_speech = [
(
["<|user|><|image|><|audio|><|end|><|assistant|>"],
[image],
[audio],
),
]
run_test(
hf_runner,
vllm_runner,
inputs_vision_speech,
model,
dtype=dtype,
max_model_len=max_model_len,
max_tokens=max_tokens,
num_logprobs=num_logprobs,
mm_limit=1,
tensor_parallel_size=1,
)

View File

@ -41,12 +41,18 @@ def glm4_1v_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict:
def _test_processing_correctness(
model_id: str,
model_id_or_arch: str,
hit_rate: float,
num_batches: int,
simplify_rate: float,
):
model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
if model_id_or_arch in HF_EXAMPLE_MODELS.get_supported_archs():
# Use model architecture to get the default model id
model_info = HF_EXAMPLE_MODELS.get_hf_info(model_id_or_arch)
model_id = model_info.default
else:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id_or_arch)
model_id = model_id_or_arch
model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip")
@ -58,7 +64,7 @@ def _test_processing_correctness(
trust_remote_code=model_info.trust_remote_code,
seed=0,
dtype="auto",
revision=None,
revision=model_info.revision,
hf_overrides=model_info.hf_overrides,
)
@ -272,12 +278,14 @@ def _test_processing_correctness_one(
"THUDM/GLM-4.1V-9B-Thinking",
"ibm-granite/granite-speech-3.3-2b",
"h2oai/h2ovl-mississippi-800m",
"internlm/Intern-S1",
"OpenGVLab/InternVL2-1B",
"OpenGVLab/InternVL3-1B",
"HuggingFaceM4/Idefics3-8B-Llama3",
"HuggingFaceTB/SmolVLM2-2.2B-Instruct",
"moonshotai/Kimi-VL-A3B-Instruct",
"meta-llama/Llama-4-Scout-17B-16E-Instruct",
"naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B",
"llava-hf/llava-1.5-7b-hf",
"llava-hf/llava-v1.6-mistral-7b-hf",
"llava-hf/LLaVA-NeXT-Video-7B-hf",
@ -330,6 +338,28 @@ def test_processing_correctness(
)
# Phi4MultimodalForCausalLM shares the same model repo with the original-format
# Phi4MMForCausalLM, so we add it as a separate test case
# Remove this test after conversion PR merged:
# https://huggingface.co/microsoft/Phi-4-multimodal-instruct/discussions/70
@pytest.mark.parametrize("model_arch", ["Phi4MultimodalForCausalLM"])
@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
@pytest.mark.parametrize("num_batches", [32])
@pytest.mark.parametrize("simplify_rate", [1.0])
def test_processing_correctness_phi4_multimodal(
model_arch: str,
hit_rate: float,
num_batches: int,
simplify_rate: float,
):
_test_processing_correctness(
model_arch,
hit_rate=hit_rate,
num_batches=num_batches,
simplify_rate=simplify_rate,
)
def _assert_inputs_equal(
a: MultiModalInputs,
b: MultiModalInputs,

View File

@ -201,6 +201,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
trust_remote_code=True),
"HunYuanDenseV1ForCausalLM":_HfExamplesInfo("tencent/Hunyuan-7B-Instruct-0124",
trust_remote_code=True),
"HCXVisionForCausalLM": _HfExamplesInfo(
"naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B",
trust_remote_code=True),
"InternLMForCausalLM": _HfExamplesInfo("internlm/internlm-chat-7b",
trust_remote_code=True),
"InternLM2ForCausalLM": _HfExamplesInfo("internlm/internlm2-chat-7b",
@ -218,6 +221,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"fp8": "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8"}), # noqa: E501
"LLaMAForCausalLM": _HfExamplesInfo("decapoda-research/llama-7b-hf",
is_available_online=False),
"Llama4ForCausalLM": _HfExamplesInfo("meta-llama/Llama-4-Scout-17B-16E-Instruct", # noqa: E501
is_available_online=False),
"MambaForCausalLM": _HfExamplesInfo("state-spaces/mamba-130m-hf"),
"Mamba2ForCausalLM": _HfExamplesInfo("mistralai/Mamba-Codestral-7B-v0.1"),
"FalconMambaForCausalLM": _HfExamplesInfo("tiiuae/falcon-mamba-7b-instruct"), # noqa: E501
@ -376,6 +381,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
extras={"2B": "OpenGVLab/InternVL2-2B",
"3.0": "OpenGVLab/InternVL3-1B"}, # noqa: E501
trust_remote_code=True),
"InternS1ForConditionalGeneration": _HfExamplesInfo("internlm/Intern-S1",
trust_remote_code=True),
"Idefics3ForConditionalGeneration": _HfExamplesInfo("HuggingFaceM4/Idefics3-8B-Llama3", # noqa: E501
{"tiny": "HuggingFaceTB/SmolVLM-256M-Instruct"}), # noqa: E501
"KeyeForConditionalGeneration": _HfExamplesInfo("Kwai-Keye/Keye-VL-8B-Preview", # noqa: E501
@ -426,6 +433,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"1.6-gemma": "AIDC-AI/Ovis1.6-Gemma2-9B"}), # noqa: E501
"Phi4MMForCausalLM": _HfExamplesInfo("microsoft/Phi-4-multimodal-instruct",
trust_remote_code=True),
"Phi4MultimodalForCausalLM": _HfExamplesInfo("microsoft/Phi-4-multimodal-instruct", # noqa: E501
revision="refs/pr/70"),
"PixtralForConditionalGeneration": _HfExamplesInfo("mistralai/Pixtral-12B-2409", # noqa: E501
tokenizer_mode="mistral"),
"QwenVLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen-VL",

View File

@ -17,7 +17,9 @@ from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tenso
CompressedTensorsW4A4Fp4, CompressedTensorsW4A16Fp4,
CompressedTensorsW4A16Sparse24, CompressedTensorsW8A8Fp8,
CompressedTensorsW8A8Int8, CompressedTensorsW8A16Fp8,
CompressedTensorsWNA16, cutlass_fp4_supported)
CompressedTensorsWNA16)
from vllm.model_executor.layers.quantization.utils.quant_utils import (
cutlass_fp4_supported)
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
sparse_cutlass_supported)
from vllm.platforms import current_platform

View File

@ -8,7 +8,10 @@ import pytest
from tests.quantization.utils import is_quant_method_supported
MODELS = ["microsoft/Phi-3-mini-4k-instruct"]
MODELS = [
"microsoft/Phi-3-mini-4k-instruct", # dense model
"ai21labs/Jamba-tiny-dev", # MoE model
]
@pytest.mark.skipif(not is_quant_method_supported("rtn"),

View File

@ -4,6 +4,7 @@
import pytest
import torch
from vllm.model_executor.models.fuyu import FuyuImagePatchInputs
from vllm.model_executor.models.phi3v import Phi3VImagePixelInputs
@ -124,3 +125,24 @@ def test_tensor_schema_with_invalid_resolve_binding_dims():
"w": 336
},
)
def test_tensor_schema_with_list_of_symbolic_dim():
flat_data = torch.stack([torch.randn(768) for _ in range(3)]) # (bn=3, fn)
patches_per_image = [64, 64, 64] # len = bn = 3
FuyuImagePatchInputs(
flat_data=flat_data,
patches_per_image=patches_per_image,
)
def test_tensor_schema_with_list_of_symbolic_dim_mismatch_in_length():
flat_data = torch.stack([torch.randn(768) for _ in range(4)]) # (bn=4, fn)
patches_per_image = [64, 64, 64] # len = 3 ≠ bn
with pytest.raises(ValueError, match="expected 'bn'=4, got 3"):
FuyuImagePatchInputs(
flat_data=flat_data,
patches_per_image=patches_per_image,
)

View File

@ -93,6 +93,7 @@ def create_common_attn_metadata(
max_query_len=max_query_len,
block_table_tensor=block_table_tensor,
slot_mapping=slot_mapping,
causal=True,
)

View File

@ -13,7 +13,6 @@ UNSUPPORTED_MODELS_V1 = [
"openai/whisper-large-v3", # transcription
"facebook/bart-large-cnn", # encoder decoder
"state-spaces/mamba-130m-hf", # mamba1
"BAAI/bge-m3", # embedding
]
MODEL = "meta-llama/Llama-3.2-1B-Instruct"

View File

@ -1,9 +1,8 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import re
import pytest
import regex as re
import requests
import torch

View File

@ -59,7 +59,7 @@ def test_basic(
# actually test chunked prompt
max_num_batched_tokens=1024,
max_model_len=8192,
gpu_memory_utilization=0.95,
gpu_memory_utilization=0.7,
max_num_seqs=max_num_seqs,
tensor_parallel_size=tensor_parallel_size) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts,

View File

@ -67,4 +67,9 @@ class InfEncoder(json.JSONEncoder):
def write_to_json(filename: str, records: list) -> None:
with open(filename, "w") as f:
json.dump(records, f, cls=InfEncoder)
json.dump(
records,
f,
cls=InfEncoder,
default=lambda o: f"<{type(o).__name__} is not JSON serializable>",
)
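The added `default=` hook only fires for objects the encoder cannot serialize, so well-formed records are written unchanged while anything exotic degrades to a placeholder string instead of crashing the run. A quick illustration using a stand-in class (not taken from the source):

```python
import json

class Opaque:  # stand-in for any object json cannot serialize
    pass

records = [{"latency_ms": 12.5, "extra": Opaque()}]
print(json.dumps(
    records,
    default=lambda o: f"<{type(o).__name__} is not JSON serializable>",
))
# [{"latency_ms": 12.5, "extra": "<Opaque is not JSON serializable>"}]
```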

View File

@ -4790,26 +4790,26 @@ class VllmConfig:
def __str__(self):
return (
f"model={self.model_config.model!r},"
f" speculative_config={self.speculative_config!r},"
f" tokenizer={self.model_config.tokenizer!r}, "
f"skip_tokenizer_init={self.model_config.skip_tokenizer_init},"
f" tokenizer_mode={self.model_config.tokenizer_mode}, "
f"model={self.model_config.model!r}, "
f"speculative_config={self.speculative_config!r}, "
f"tokenizer={self.model_config.tokenizer!r}, "
f"skip_tokenizer_init={self.model_config.skip_tokenizer_init}, "
f"tokenizer_mode={self.model_config.tokenizer_mode}, "
f"revision={self.model_config.revision}, "
f"override_neuron_config={self.model_config.override_neuron_config},"
f" tokenizer_revision={self.model_config.tokenizer_revision}, "
f"override_neuron_config={self.model_config.override_neuron_config}, " # noqa
f"tokenizer_revision={self.model_config.tokenizer_revision}, "
f"trust_remote_code={self.model_config.trust_remote_code}, "
f"dtype={self.model_config.dtype}, "
f"max_seq_len={self.model_config.max_model_len},"
f" download_dir={self.load_config.download_dir!r}, "
f"max_seq_len={self.model_config.max_model_len}, "
f"download_dir={self.load_config.download_dir!r}, "
f"load_format={self.load_config.load_format}, "
f"tensor_parallel_size={self.parallel_config.tensor_parallel_size},"
f" pipeline_parallel_size={self.parallel_config.pipeline_parallel_size}, " # noqa
f"tensor_parallel_size={self.parallel_config.tensor_parallel_size}, " # noqa
f"pipeline_parallel_size={self.parallel_config.pipeline_parallel_size}, " # noqa
f"disable_custom_all_reduce={self.parallel_config.disable_custom_all_reduce}, " # noqa
f"quantization={self.model_config.quantization}, "
f"enforce_eager={self.model_config.enforce_eager}, "
f"kv_cache_dtype={self.cache_config.cache_dtype}, "
f" device_config={self.device_config.device}, "
f"device_config={self.device_config.device}, "
f"decoding_config={self.decoding_config!r}, "
f"observability_config={self.observability_config!r}, "
f"seed={self.model_config.seed}, "

View File

@ -156,8 +156,16 @@ class SharedStorageConnector(KVConnectorBase_V1):
logger.info("Inject KV cache of %d tokens to the paged memory",
len(request.slot_mapping))
for layer_name in forward_context.no_compile_layers:
attn_layer = forward_context.no_compile_layers[layer_name]
kv_cache_layer = attn_layer.kv_cache[\
layer = forward_context.no_compile_layers[layer_name]
# Only process layers that have a kv_cache
# attribute (attention layers); skip non-attention
# layers such as FusedMoE/MLP.
kv_cache_attr = getattr(layer, 'kv_cache', None)
if kv_cache_attr is None:
continue
kv_cache_layer = kv_cache_attr[ \
forward_context.virtual_engine]
filename = self._generate_filename_debug(

View File

@ -1649,7 +1649,8 @@ class EngineArgs:
if (self.max_num_seqs is None
and usage_context in default_max_num_seqs):
self.max_num_seqs = default_max_num_seqs[usage_context]
self.max_num_seqs = min(default_max_num_seqs[usage_context],
self.max_num_batched_tokens or sys.maxsize)
logger.debug("Setting max_num_seqs to %d for %s usage context.",
self.max_num_seqs, use_context_value)
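The net effect: when `max_num_batched_tokens` is set, it now caps the usage-context default for `max_num_seqs`, since a batch cannot hold more sequences than it has token budget. A tiny worked sketch with made-up numbers:

```python
import sys

default_max_num_seqs = 1024      # hypothetical default for this usage context
max_num_batched_tokens = 512     # user-supplied; None would mean "not set"

max_num_seqs = min(default_max_num_seqs, max_num_batched_tokens or sys.maxsize)
print(max_num_seqs)  # 512; with max_num_batched_tokens=None it would stay 1024
```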

View File

@ -97,11 +97,16 @@ class MQLLMEngineClient(EngineClient):
self.model_config = engine_config.model_config
self.decoding_config = engine_config.decoding_config
# Create the tokenizer group.
self.tokenizer = init_tokenizer_from_configs(
model_config=self.model_config,
scheduler_config=engine_config.scheduler_config,
lora_config=engine_config.lora_config)
if self.vllm_config.model_config.skip_tokenizer_init:
self.tokenizer = None
else:
# Create the tokenizer group.
self.tokenizer = init_tokenizer_from_configs(
model_config=self.model_config,
scheduler_config=engine_config.scheduler_config,
lora_config=engine_config.lora_config)
self.input_preprocessor = InputPreprocessor(self.model_config,
self.tokenizer)
@ -375,7 +380,10 @@ class MQLLMEngineClient(EngineClient):
return self.input_preprocessor
async def get_tokenizer(self, lora_request: Optional[LoRARequest] = None):
return await self.tokenizer.get_lora_tokenizer_async(lora_request)
if self.tokenizer is None:
return None
else:
return await self.tokenizer.get_lora_tokenizer_async(lora_request)
async def get_vllm_config(self) -> VllmConfig:
return self.vllm_config

View File

@ -14,6 +14,7 @@ from pydantic import ValidationError
from tqdm.auto import tqdm
from typing_extensions import TypeVar, deprecated
import vllm.envs as envs
from vllm.beam_search import (BeamSearchInstance, BeamSearchOutput,
BeamSearchSequence,
create_sort_beams_key_function)
@ -44,9 +45,10 @@ from vllm.model_executor.layers.quantization import QuantizationMethods
from vllm.outputs import (ClassificationRequestOutput, EmbeddingRequestOutput,
PoolingRequestOutput, RequestOutput,
ScoringRequestOutput)
from vllm.pooling_params import PoolingParams, PoolingTask
from vllm.pooling_params import PoolingParams
from vllm.sampling_params import (BeamSearchParams, GuidedDecodingParams,
RequestOutputKind, SamplingParams)
from vllm.tasks import PoolingTask
from vllm.transformers_utils.tokenizer import (AnyTokenizer, MistralTokenizer,
get_cached_tokenizer)
from vllm.usage.usage_lib import UsageContext
@ -277,6 +279,16 @@ class LLM:
self.request_counter = Counter()
self.default_sampling_params: Union[dict[str, Any], None] = None
if envs.VLLM_USE_V1:
supported_tasks = self.llm_engine \
.get_supported_tasks() # type: ignore
else:
supported_tasks = self.llm_engine.model_config.supported_tasks
logger.info("Supported_tasks: %s", supported_tasks)
self.supported_tasks = supported_tasks
def get_tokenizer(
self,
lora_request: Optional[LoRARequest] = None,
@ -1170,8 +1182,7 @@ class LLM:
A list of `EmbeddingRequestOutput` objects containing the
embedding vectors in the same order as the input prompts.
"""
model_config = self.llm_engine.model_config
if "embed" not in model_config.supported_tasks:
if "embed" not in self.supported_tasks:
raise ValueError("Embedding API is not supported by this model. "
"Please set `--task embed`.")
@ -1215,8 +1226,7 @@ class LLM:
A list of `ClassificationRequestOutput` objects containing the
embedding vectors in the same order as the input prompts.
"""
model_config = self.llm_engine.model_config
if "classify" not in model_config.supported_tasks:
if "classify" not in self.supported_tasks:
raise ValueError(
"Classification API is not supported by this model. "
"Please set `--task classify`.")
@ -1397,8 +1407,8 @@ class LLM:
raise ValueError(" ".join(messages))
if all(t not in model_config.supported_tasks
for t in ("embed", "classify")):
supported_tasks = self.supported_tasks
if all(t not in supported_tasks for t in ("embed", "classify")):
raise ValueError("Score API is not supported by this model. "
"Please set `--task embed` or `--task classify`.")

View File

@ -1586,6 +1586,14 @@ async def init_app_state(
state.vllm_config = vllm_config
model_config = vllm_config.model_config
if envs.VLLM_USE_V1:
supported_tasks = await engine_client \
.get_supported_tasks() # type: ignore
else:
supported_tasks = model_config.supported_tasks
logger.info("Supported_tasks: %s", supported_tasks)
resolved_chat_template = load_chat_template(args.chat_template)
if resolved_chat_template is not None:
# Get the tokenizer to check official template
@ -1647,7 +1655,7 @@ async def init_app_state(
reasoning_parser=args.reasoning_parser,
enable_prompt_tokens_details=args.enable_prompt_tokens_details,
enable_force_include_usage=args.enable_force_include_usage,
) if "generate" in model_config.supported_tasks else None
) if "generate" in supported_tasks else None
state.openai_serving_chat = OpenAIServingChat(
engine_client,
model_config,
@ -1664,7 +1672,7 @@ async def init_app_state(
reasoning_parser=args.reasoning_parser,
enable_prompt_tokens_details=args.enable_prompt_tokens_details,
enable_force_include_usage=args.enable_force_include_usage,
) if "generate" in model_config.supported_tasks else None
) if "generate" in supported_tasks else None
state.openai_serving_completion = OpenAIServingCompletion(
engine_client,
model_config,
@ -1673,7 +1681,7 @@ async def init_app_state(
return_tokens_as_token_ids=args.return_tokens_as_token_ids,
enable_prompt_tokens_details=args.enable_prompt_tokens_details,
enable_force_include_usage=args.enable_force_include_usage,
) if "generate" in model_config.supported_tasks else None
) if "generate" in supported_tasks else None
state.openai_serving_pooling = OpenAIServingPooling(
engine_client,
model_config,
@ -1681,7 +1689,7 @@ async def init_app_state(
request_logger=request_logger,
chat_template=resolved_chat_template,
chat_template_content_format=args.chat_template_content_format,
) if "encode" in model_config.supported_tasks else None
) if "encode" in supported_tasks else None
state.openai_serving_embedding = OpenAIServingEmbedding(
engine_client,
model_config,
@ -1689,24 +1697,22 @@ async def init_app_state(
request_logger=request_logger,
chat_template=resolved_chat_template,
chat_template_content_format=args.chat_template_content_format,
) if "embed" in model_config.supported_tasks else None
) if "embed" in supported_tasks else None
state.openai_serving_classification = ServingClassification(
engine_client,
model_config,
state.openai_serving_models,
request_logger=request_logger,
) if "classify" in model_config.supported_tasks else None
) if "classify" in supported_tasks else None
enable_serving_reranking = ("classify" in model_config.supported_tasks
and getattr(model_config.hf_config,
"num_labels", 0) == 1)
enable_serving_reranking = ("classify" in supported_tasks and getattr(
model_config.hf_config, "num_labels", 0) == 1)
state.openai_serving_scores = ServingScores(
engine_client,
model_config,
state.openai_serving_models,
request_logger=request_logger,
) if ("embed" in model_config.supported_tasks
or enable_serving_reranking) else None
) if ("embed" in supported_tasks or enable_serving_reranking) else None
state.openai_serving_tokenization = OpenAIServingTokenization(
engine_client,
@ -1721,13 +1727,13 @@ async def init_app_state(
model_config,
state.openai_serving_models,
request_logger=request_logger,
) if "transcription" in model_config.supported_tasks else None
) if "transcription" in supported_tasks else None
state.openai_serving_translation = OpenAIServingTranslation(
engine_client,
model_config,
state.openai_serving_models,
request_logger=request_logger,
) if "transcription" in model_config.supported_tasks else None
) if "transcription" in supported_tasks else None
state.task = model_config.task
state.enable_server_load_tracking = args.enable_server_load_tracking

View File

@ -1007,6 +1007,13 @@ class CompletionRequest(OpenAIBaseModel):
"default: 0). Any priority other than 0 will raise an error "
"if the served model does not use priority scheduling."),
)
request_id: str = Field(
default_factory=lambda: f"{random_uuid()}",
description=(
"The request_id related to this request. If the caller does "
"not set it, a random_uuid will be generated. This id is used "
"throughout the inference process and returned in the response."),
)
logits_processors: Optional[LogitsProcessors] = Field(
default=None,
description=(
@ -1251,6 +1258,13 @@ class EmbeddingCompletionRequest(OpenAIBaseModel):
"default: 0). Any priority other than 0 will raise an error "
"if the served model does not use priority scheduling."),
)
request_id: str = Field(
default_factory=lambda: f"{random_uuid()}",
description=(
"The request_id related to this request. If the caller does "
"not set it, a random_uuid will be generated. This id is used "
"throughout the inference process and returned in the response."),
)
# --8<-- [end:embedding-extra-params]
@ -1302,6 +1316,13 @@ class EmbeddingChatRequest(OpenAIBaseModel):
"default: 0). Any priority other than 0 will raise an error "
"if the served model does not use priority scheduling."),
)
request_id: str = Field(
default_factory=lambda: f"{random_uuid()}",
description=(
"The request_id related to this request. If the caller does "
"not set it, a random_uuid will be generated. This id is used "
"throughout the inference process and returned in the response."),
)
# --8<-- [end:chat-embedding-extra-params]
@model_validator(mode="before")
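Because the new `request_id` field defaults to a random UUID, callers that want end-to-end tracing can supply their own value; a sketch of what that could look like against a locally running OpenAI-compatible vLLM server (URL, port, and model name are placeholders, not taken from this commit):

```python
import requests

resp = requests.post(
    "http://localhost:8000/v1/completions",  # hypothetical local deployment
    json={
        "model": "meta-llama/Llama-3.2-1B-Instruct",  # placeholder model
        "prompt": "Hello",
        "max_tokens": 8,
        # Caller-chosen id; per the field description it is used throughout
        # inference and reflected in the response.
        "request_id": "my-trace-0001",
    },
)
# The completion id is expected to be derived from the supplied value
# (see the OpenAIServingCompletion change below), e.g. "cmpl-my-trace-0001".
print(resp.json()["id"])
```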

View File

@ -14,6 +14,7 @@ import torch
from prometheus_client import start_http_server
from tqdm import tqdm
import vllm.envs as envs
from vllm.config import VllmConfig
from vllm.engine.arg_utils import AsyncEngineArgs, optional_type
from vllm.engine.protocol import EngineClient
@ -335,6 +336,14 @@ async def run_batch(
model_config = vllm_config.model_config
if envs.VLLM_USE_V1:
supported_tasks = await engine_client \
.get_supported_tasks() # type: ignore
else:
supported_tasks = model_config.supported_tasks
logger.info("Supported_tasks: %s", supported_tasks)
# Create the openai serving objects.
openai_serving_models = OpenAIServingModels(
engine_client=engine_client,
@ -351,7 +360,7 @@ async def run_batch(
chat_template=None,
chat_template_content_format="auto",
enable_prompt_tokens_details=args.enable_prompt_tokens_details,
) if "generate" in model_config.supported_tasks else None
) if "generate" in supported_tasks else None
openai_serving_embedding = OpenAIServingEmbedding(
engine_client,
model_config,
@ -359,19 +368,17 @@ async def run_batch(
request_logger=request_logger,
chat_template=None,
chat_template_content_format="auto",
) if "embed" in model_config.supported_tasks else None
) if "embed" in supported_tasks else None
enable_serving_reranking = ("classify" in model_config.supported_tasks
and getattr(model_config.hf_config,
"num_labels", 0) == 1)
enable_serving_reranking = ("classify" in supported_tasks and getattr(
model_config.hf_config, "num_labels", 0) == 1)
openai_serving_scores = ServingScores(
engine_client,
model_config,
openai_serving_models,
request_logger=request_logger,
) if ("embed" in model_config.supported_tasks
or enable_serving_reranking) else None
) if ("embed" in supported_tasks or enable_serving_reranking) else None
tracker = BatchProgressTracker()
logger.info("Reading batch from %s...", args.input_file)

View File

@ -113,7 +113,9 @@ class OpenAIServingCompletion(OpenAIServing):
return self.create_error_response(
"Echo is unsupported with prompt embeds.")
request_id = f"cmpl-{self._base_request_id(raw_request)}"
request_id = (
f"cmpl-"
f"{self._base_request_id(raw_request, request.request_id)}")
created_time = int(time.time())
request_metadata = RequestResponseMetadata(request_id=request_id)

View File

@ -163,8 +163,9 @@ class OpenAIServingEmbedding(EmbeddingMixin):
for the API specification. This API mimics the OpenAI Embedding API.
"""
model_name = self._get_model_name(request.model)
request_id = (f"{self.request_id_prefix}-"
f"{self._base_request_id(raw_request)}")
request_id = (
f"{self.request_id_prefix}-"
f"{self._base_request_id(raw_request, request.request_id)}")
ctx = EmbeddingServeContext(
request=request,

View File

@ -880,7 +880,10 @@ class OpenAIServing:
_chat_template_kwargs.update(chat_template_kwargs or {})
request_prompt: Union[str, list[int]]
if isinstance(tokenizer, MistralTokenizer):
if tokenizer is None:
request_prompt = "placeholder"
elif isinstance(tokenizer, MistralTokenizer):
request_prompt = apply_mistral_chat_template(
tokenizer,
messages=messages,
@ -910,7 +913,14 @@ class OpenAIServing:
request = tool_parser(tokenizer).adjust_request( # type: ignore
request=request)
if isinstance(request_prompt, str):
if tokenizer is None:
assert isinstance(request_prompt, str), (
"Prompt has to be a string", \
"when the tokenizer is not initialised"
)
prompt_inputs = TextTokensPrompt(prompt=request_prompt,
prompt_token_ids=[1])
elif isinstance(request_prompt, str):
prompt_inputs = await self._tokenize_prompt_input_async(
request,
tokenizer,
@ -947,9 +957,11 @@ class OpenAIServing:
def _load_and_validate_embed(embed: bytes) -> EmbedsPrompt:
tensor = torch.load(io.BytesIO(base64.b64decode(embed)),
weights_only=True)
assert isinstance(
tensor,
(torch.FloatTensor, torch.BFloat16Tensor, torch.HalfTensor))
assert isinstance(tensor, torch.Tensor) and tensor.dtype in (
torch.float32,
torch.bfloat16,
torch.float16,
)
if tensor.dim() > 2:
tensor = tensor.squeeze(0)
assert tensor.dim() == 2

View File

@ -96,7 +96,11 @@ class OpenAIServingPooling(OpenAIServing):
self.max_model_len, truncate_prompt_tokens)
lora_request = self._maybe_get_adapters(request)
tokenizer = await self.engine_client.get_tokenizer(lora_request)
if self.model_config.skip_tokenizer_init:
tokenizer = None
else:
tokenizer = await self.engine_client.get_tokenizer(lora_request
)
if isinstance(request, PoolingChatRequest):
(

View File

@ -16,8 +16,8 @@ from vllm.config import VllmConfig
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.pooling_params import PoolingTask
from vllm.sequence import ExecuteModelRequest, PoolerOutput
from vllm.tasks import SupportedTask
from vllm.utils import make_async
from vllm.worker.worker_base import WorkerBase
@ -136,9 +136,9 @@ class ExecutorBase(ABC):
return self.collective_rpc(rpc_func)
@cached_property # Avoid unnecessary RPC calls
def supported_pooling_tasks(self) -> tuple[PoolingTask, ...]:
output = self.collective_rpc("get_supported_pooling_tasks")
return tuple({task for tasks in output for task in tasks})
def supported_tasks(self) -> tuple[SupportedTask, ...]:
output = self.collective_rpc("get_supported_tasks")
return output[0]
def execute_model(
self, execute_model_req: ExecuteModelRequest

View File

@ -1127,6 +1127,7 @@ def flashinfer_fused_moe_blockscale_fp8(
tile_tokens_dim=_get_tile_tokens_dim(x.shape[0], top_k,
global_num_experts),
routing_method_type=2, # DeepSeek-styled routing method
use_shuffled_weight=False,
)

View File

@ -5,144 +5,8 @@ from typing import Optional
import torch
from vllm import _custom_ops as ops
from vllm.triton_utils import tl, triton
from vllm.utils import cdiv, round_up
@triton.jit
def moe_align_block_size_stage1(
topk_ids_ptr,
tokens_cnts_ptr,
num_experts: tl.constexpr,
numel: tl.constexpr,
tokens_per_thread: tl.constexpr,
):
pid = tl.program_id(0)
start_idx = pid * tokens_per_thread
off_c = (pid + 1) * num_experts
for i in range(tokens_per_thread):
if start_idx + i < numel:
idx = tl.load(topk_ids_ptr + start_idx + i)
token_cnt = tl.load(tokens_cnts_ptr + off_c + idx)
tl.store(tokens_cnts_ptr + off_c + idx, token_cnt + 1)
@triton.jit
def moe_align_block_size_stage2(
tokens_cnts_ptr,
num_experts: tl.constexpr,
):
pid = tl.program_id(0)
last_cnt = 0
for i in range(1, num_experts + 1):
token_cnt = tl.load(tokens_cnts_ptr + i * num_experts + pid)
last_cnt = last_cnt + token_cnt
tl.store(tokens_cnts_ptr + i * num_experts + pid, last_cnt)
@triton.jit
def moe_align_block_size_stage3(
total_tokens_post_pad_ptr,
tokens_cnts_ptr,
cumsum_ptr,
num_experts: tl.constexpr,
block_size: tl.constexpr,
):
last_cumsum = 0
off_cnt = num_experts * num_experts
for i in range(1, num_experts + 1):
token_cnt = tl.load(tokens_cnts_ptr + off_cnt + i - 1)
last_cumsum = last_cumsum + tl.cdiv(token_cnt, block_size) * block_size
tl.store(cumsum_ptr + i, last_cumsum)
tl.store(total_tokens_post_pad_ptr, last_cumsum)
@triton.jit
def moe_align_block_size_stage4(
topk_ids_ptr,
sorted_token_ids_ptr,
expert_ids_ptr,
tokens_cnts_ptr,
cumsum_ptr,
num_experts: tl.constexpr,
block_size: tl.constexpr,
numel: tl.constexpr,
tokens_per_thread: tl.constexpr,
):
pid = tl.program_id(0)
start_idx = tl.load(cumsum_ptr + pid)
end_idx = tl.load(cumsum_ptr + pid + 1)
for i in range(start_idx, end_idx, block_size):
tl.store(expert_ids_ptr + i // block_size, pid)
start_idx = pid * tokens_per_thread
off_t = pid * num_experts
for i in range(start_idx, tl.minimum(start_idx + tokens_per_thread,
numel)):
expert_id = tl.load(topk_ids_ptr + i)
token_cnt = tl.load(tokens_cnts_ptr + off_t + expert_id)
rank_post_pad = token_cnt + tl.load(cumsum_ptr + expert_id)
tl.store(sorted_token_ids_ptr + rank_post_pad, i)
tl.store(tokens_cnts_ptr + off_t + expert_id, token_cnt + 1)
# Triton implementation based on:
# https://github.com/sgl-project/sglang/commit/ba5112ff691d791a9e38c6c71f59324a5fcb49d0
def moe_align_block_size_triton(
topk_ids: torch.Tensor,
num_experts: int,
block_size: int,
sorted_token_ids: torch.Tensor,
expert_ids: torch.Tensor,
num_tokens_post_pad: torch.Tensor,
) -> None:
numel = topk_ids.numel()
grid = (num_experts, )
tokens_cnts = torch.zeros((num_experts + 1, num_experts),
dtype=torch.int32,
device=topk_ids.device)
cumsum = torch.zeros((num_experts + 1, ),
dtype=torch.int32,
device=topk_ids.device)
tokens_per_thread = cdiv(numel, num_experts)
sorted_token_ids.fill_(numel)
expert_ids.zero_()
moe_align_block_size_stage1[grid](
topk_ids,
tokens_cnts,
num_experts,
numel,
tokens_per_thread,
)
moe_align_block_size_stage2[grid](
tokens_cnts,
num_experts,
)
moe_align_block_size_stage3[(1, )](
num_tokens_post_pad,
tokens_cnts,
cumsum,
num_experts,
block_size,
)
moe_align_block_size_stage4[grid](
topk_ids,
sorted_token_ids,
expert_ids,
tokens_cnts,
cumsum,
num_experts,
block_size,
numel,
tokens_per_thread,
)
from vllm.triton_utils import triton
from vllm.utils import round_up
def moe_align_block_size(

View File

@ -76,43 +76,43 @@ def _moe_unpermute_and_reduce(
def moe_permute(
hidden_states: torch.Tensor,
topk_weights: torch.Tensor,
a1q_scale: Optional[torch.Tensor],
topk_ids: torch.Tensor,
token_expert_indices: torch.Tensor,
topk: int,
n_expert: int,
n_local_expert: int,
n_local_expert: int = -1,
expert_map: Optional[torch.Tensor] = None,
align_block_size: Optional[int] = None,
fill_invalid_expert: int = -1
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
) -> tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor, torch.Tensor,
torch.Tensor]:
"""
This function expands and permutes the activations to gather non-contiguous
tokens for each expert.
Parameters:
- hidden_states (torch.Tensor): The input tensor to the MoE layer.
- topk_weights (torch.Tensor): topk expert route weight for each token.
- a1q_scale (Optional[torch.Tensor]): quant scale for hidden_states
- topk_ids (torch.Tensor): topk expert route id for each token.
- token_expert_indices (torch.Tensor): indice for expanded hidden.
- topk (int): The number of top-k experts to select.
- n_expert (int): The number of expert.
- n_local_expert (int): The number of expert in current EP rank.
- expert_map (Optional[torch.Tensor]): A tensor mapping expert indices
from the global expert space to the local expert space of the expert
from the global expert space to the local expert space of the expert
parallel shard.
- align_block_size (Optional[int]): align group gemm block size for deepgemm
- fill_invalid_expert(int): fill expert id in m_indices for invalid expert
to workaround DeepGemm unsupported -1 in m_indices
Returns:
- permuted_hidden_states (torch.Tensor): permuted activation.
- a1q_scale (Optional[torch.Tensor]): quant scale for hidden_states
- expert_first_token_offset (torch.Tensor): offset of the first token
of each expert for standard grouped gemm. if enable 'align_block_size'
expert_first_token_offset will align up to 'align_block_size'.
- src_row_id2dst_row_id_map (torch.Tensor): idx map for moe_unpermute.
- inv_permuted_idx (torch.Tensor): idx map for moe_unpermute.
- permuted_idx (torch.Tensor): idx map from hidden to permuted_hidden.
- m_indices: m_indices for grouped gemm in deepgemm; `m_indices[i]` records
the group to which the i-th row of the LHS belongs.
"""
n_token, n_hidden = hidden_states.size()
topk = topk_ids.size(1)
assert (n_hidden * hidden_states.element_size()
) % 16 == 0, "permue kernel need hidden dim align to 16B"
permuted_row_size = n_token * topk
@ -120,12 +120,19 @@ def moe_permute(
permuted_row_size = (permuted_row_size + n_expert *
(align_block_size - 1) + align_block_size -
1) // align_block_size * align_block_size
if n_local_expert == -1:
n_local_expert = n_expert
permuted_hidden_states = torch.empty(
(permuted_row_size, n_hidden),
dtype=hidden_states.dtype,
device=hidden_states.device,
)
token_expert_indices = torch.arange(0,
n_token * topk,
dtype=torch.int32,
device=hidden_states.device).reshape(
(n_token, topk))
m_indices = torch.full((permuted_row_size, ),
fill_invalid_expert,
dtype=torch.int32,
@ -133,57 +140,54 @@ def moe_permute(
expert_first_token_offset = torch.empty(n_local_expert + 1,
dtype=torch.int64,
device=hidden_states.device)
src_row_id2dst_row_id_map = torch.empty((n_token, topk),
dtype=torch.int32,
device=hidden_states.device)
torch.ops._moe_C.moe_permute(hidden_states, topk_weights, topk_ids,
token_expert_indices, expert_map, n_expert,
n_local_expert, topk, align_block_size,
permuted_hidden_states,
expert_first_token_offset,
src_row_id2dst_row_id_map, m_indices)
return (permuted_hidden_states, expert_first_token_offset,
src_row_id2dst_row_id_map, m_indices)
permuted_idx = torch.full((permuted_row_size, ),
n_token * topk,
dtype=torch.int32,
device=hidden_states.device)
inv_permuted_idx = torch.empty((n_token, topk),
dtype=torch.int32,
device=hidden_states.device)
topk_ids = topk_ids.to(torch.int32)
torch.ops._moe_C.moe_permute(hidden_states, topk_ids, token_expert_indices,
expert_map, n_expert, n_local_expert, topk,
align_block_size, permuted_hidden_states,
expert_first_token_offset, inv_permuted_idx,
permuted_idx, m_indices)
if a1q_scale is not None:
a1q_scale = a1q_scale[permuted_idx.clamp(max=n_token * topk - 1) //
topk]
return (permuted_hidden_states, a1q_scale, expert_first_token_offset,
inv_permuted_idx.flatten(), m_indices)
def moe_unpermute(
out: torch.Tensor,
permuted_hidden_states: torch.Tensor,
topk_weights: torch.Tensor,
topk_ids: torch.Tensor,
src_row_id2dst_row_id_map: torch.Tensor,
expert_first_token_offset: torch.Tensor,
topk: int,
n_expert: int,
n_local_expert: int,
) -> torch.Tensor:
inv_permuted_idx: torch.Tensor,
expert_first_token_offset: Optional[torch.Tensor] = None,
) -> None:
"""
This function unpermutes the expert outputs and reduces them back to the
original token order, weighting each token's expert contributions.
Parameters:
- out (torch.Tensor): output tensor
- permuted_hidden_states (torch.Tensor): permuted activation.
- topk_weights (torch.Tensor): topk expert route weight for each token.
- topk_ids (torch.Tensor): topk expert route id for each token.
- expert_first_token_offset (torch.Tensor): offset of the first token
of each expert for grouped gemm.
- topk (int): The number of top-k experts to select.
- n_expert (int): The number of expert.
- n_local_expert (int): The number of expert in current EP rank.
- inv_permuted_idx (torch.Tensor): row idx map for moe_unpermute.
- expert_first_token_offset (Optional[torch.Tensor]): offset of the first
token of each expert for grouped gemm.
Returns:
- hidden_states (torch.Tensor): The reduced and unpermuted activation
tensor.
"""
n_token, n_hidden = topk_weights.size(0), permuted_hidden_states.size(-1)
topk = topk_weights.size(1)
n_hidden = permuted_hidden_states.size(-1)
assert (n_hidden * permuted_hidden_states.element_size()
) % 16 == 0, "unpermue kernel need hidden dim align to 16B"
hidden_states = torch.empty((n_token, n_hidden),
dtype=permuted_hidden_states.dtype,
device=permuted_hidden_states.device)
torch.ops._moe_C.moe_unpermute(permuted_hidden_states, topk_weights,
topk_ids, src_row_id2dst_row_id_map,
expert_first_token_offset, n_expert,
n_local_expert, topk, hidden_states)
return hidden_states
inv_permuted_idx, expert_first_token_offset,
topk, out)
def moe_permute_unpermute_supported():
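Taken together, the new API permutes once, runs the grouped gemm on the permuted activations, and then writes the weighted, unpermuted result into a caller-allocated buffer. A rough calling sketch assuming the signatures as they read in this diff (import path is an assumption, a CUDA build with the `_moe_C` extension is required, and the grouped gemm is stubbed out):

```python
import torch
# Assumed import path for these helpers.
from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import (
    moe_permute, moe_unpermute)

n_token, n_hidden, topk, n_expert = 16, 128, 2, 4
hidden_states = torch.randn(n_token, n_hidden, dtype=torch.float16, device="cuda")
topk_weights = torch.rand(n_token, topk, device="cuda")
topk_ids = torch.randint(0, n_expert, (n_token, topk), device="cuda")

(permuted_hidden_states, a1q_scale, expert_first_token_offset,
 inv_permuted_idx, m_indices) = moe_permute(hidden_states,
                                            a1q_scale=None,
                                            topk_ids=topk_ids,
                                            n_expert=n_expert)

# ... the grouped gemm over permuted_hidden_states would run here ...

out = torch.empty_like(hidden_states)
moe_unpermute(out, permuted_hidden_states, topk_weights, inv_permuted_idx,
              expert_first_token_offset)
```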

View File

@ -24,6 +24,7 @@ from vllm.model_executor.layers.mamba.mamba_utils import (
extra_groups_for_head_shards, get_mamba_state_shape)
from vllm.model_executor.layers.mamba.ops.causal_conv1d import (
causal_conv1d_fn, causal_conv1d_update)
from vllm.model_executor.layers.mamba.ops.layernorm_gated import rms_norm_gated
from vllm.model_executor.layers.mamba.ops.mamba_ssm import (
selective_state_update)
from vllm.model_executor.layers.mamba.ops.ssd_combined import (
@ -133,21 +134,15 @@ class Mixer2RMSNormGated(CustomOp):
return x * nn.functional.silu(gate.to(
torch.float32)).to(input_dtype)
if self.tp_size > 1 or self.n_groups != 1:
if (((self.n_groups % self.tp_size) != 0) or self.n_groups != 1):
return self.forward_native(x, gate)
from vllm import _custom_ops as ops
# cast x and gate to float32 before silu
out = torch.empty_like(x)
y = x * nn.functional.silu(gate.to(torch.float32))
ops.rms_norm(
out,
y.to(x.dtype),
self.weight.data,
self.variance_epsilon,
)
return out
return rms_norm_gated(x,
self.weight.data,
bias=None,
z=gate,
eps=self.variance_epsilon,
norm_before_gate=False)
def mamba_v2_sharded_weight_loader(

View File

@ -0,0 +1,168 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Copyright (c) 2024, Tri Dao.
# Adapted from https://github.com/state-spaces/mamba/blob/60dadf2e0ee730ac337035d5533de10bc26e4847/mamba_ssm/ops/triton/layernorm_gated.py
import torch
from vllm.triton_utils import tl, triton
@triton.heuristics({"HAS_BIAS": lambda args: args["B"] is not None})
@triton.heuristics({"HAS_Z": lambda args: args["Z"] is not None})
@triton.jit
def _layer_norm_fwd_1pass_kernel(
X, # pointer to the input
Y, # pointer to the output
W, # pointer to the weights
B, # pointer to the biases
Z, # pointer to the other branch
Mean, # pointer to the mean
Rstd, # pointer to the 1/std
stride_x_row: tl.int64,
stride_y_row: tl.int64,
stride_z_row: tl.int64,
M: tl.int64, # number of rows in X
N: tl.int64, # number of columns in X
eps, # epsilon to avoid division by zero
BLOCK_N: tl.constexpr,
HAS_BIAS: tl.constexpr,
HAS_Z: tl.constexpr,
NORM_BEFORE_GATE: tl.constexpr,
IS_RMS_NORM: tl.constexpr,
):
# Map the program id to the row of X and Y it should compute.
row = tl.program_id(0)
group = tl.program_id(1)
X += row * stride_x_row + group * N
Y += row * stride_y_row + group * N
if HAS_Z:
Z += row * stride_z_row + group * N
if not IS_RMS_NORM:
Mean += group * M
Rstd += group * M
W += group * N
if HAS_BIAS:
B += group * N
# Compute mean and variance
cols = tl.arange(0, BLOCK_N)
x = tl.load(X + cols, mask=cols < N, other=0.).to(tl.float32)
if HAS_Z and not NORM_BEFORE_GATE:
z = tl.load(Z + cols, mask=cols < N).to(tl.float32)
x *= z * tl.sigmoid(z)
if not IS_RMS_NORM:
mean = tl.sum(x, axis=0) / N
tl.store(Mean + row, mean)
xbar = tl.where(cols < N, x - mean, 0.)
var = tl.sum(xbar * xbar, axis=0) / N
else:
xbar = tl.where(cols < N, x, 0.)
var = tl.sum(xbar * xbar, axis=0) / N
rstd = 1 / tl.sqrt(var + eps)
tl.store(Rstd + row, rstd)
# Normalize and apply linear transformation
mask = cols < N
w = tl.load(W + cols, mask=mask).to(tl.float32)
if HAS_BIAS:
b = tl.load(B + cols, mask=mask).to(tl.float32)
x_hat = (x - mean) * rstd if not IS_RMS_NORM else x * rstd
y = x_hat * w + b if HAS_BIAS else x_hat * w
if HAS_Z and NORM_BEFORE_GATE:
z = tl.load(Z + cols, mask=mask).to(tl.float32)
y *= z * tl.sigmoid(z)
# Write output
tl.store(Y + cols, y, mask=mask)
def _layer_norm_fwd(x,
weight,
bias,
eps,
z=None,
out=None,
group_size=None,
norm_before_gate=True,
is_rms_norm=False):
M, N = x.shape
if group_size is None:
group_size = N
assert N % group_size == 0
ngroups = N // group_size
assert x.stride(-1) == 1
if z is not None:
assert z.stride(-1) == 1
assert z.shape == (M, N)
assert weight.shape == (N, )
assert weight.stride(-1) == 1
if bias is not None:
assert bias.stride(-1) == 1
assert bias.shape == (N, )
# allocate output
if out is not None:
assert out.shape == x.shape
else:
out = torch.empty_like(x)
assert out.stride(-1) == 1
mean = torch.empty((ngroups * M, ), dtype=torch.float32,
device=x.device) if not is_rms_norm else None
rstd = torch.empty((ngroups * M, ), dtype=torch.float32, device=x.device)
# Less than 64KB per feature: enqueue fused kernel
MAX_FUSED_SIZE = 65536 // x.element_size()
BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(group_size))
if group_size > BLOCK_N:
raise RuntimeError(
"This layer norm doesn't support feature dim >= 64KB.")
# heuristics for number of warps
num_warps = min(max(BLOCK_N // 256, 1), 8)
grid = (M, ngroups)
with torch.cuda.device(x.device.index):
_layer_norm_fwd_1pass_kernel[grid](x,
out,
weight,
bias,
z,
mean,
rstd,
x.stride(0),
out.stride(0),
z.stride(0) if z is not None else 0,
M,
group_size,
eps,
BLOCK_N=BLOCK_N,
NORM_BEFORE_GATE=norm_before_gate,
IS_RMS_NORM=is_rms_norm,
num_warps=num_warps)
return out, mean, rstd
def rms_norm_gated(x,
weight,
bias,
z=None,
eps=1e-6,
group_size=None,
norm_before_gate=True):
x_shape_og = x.shape
# reshape input data into 2D tensor
x = x.reshape(-1, x.shape[-1])
if x.stride(-1) != 1:
x = x.contiguous()
if z is not None:
assert z.shape == x_shape_og
z = z.reshape(-1, z.shape[-1])
if z.stride(-1) != 1:
z = z.contiguous()
weight = weight.contiguous()
if bias is not None:
bias = bias.contiguous()
y, _, _ = _layer_norm_fwd(x,
weight,
bias,
eps,
z=z,
group_size=group_size,
norm_before_gate=norm_before_gate,
is_rms_norm=True)
return y.reshape(x_shape_og)
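For intuition on the replacement: with `norm_before_gate=False`, `rms_norm_gated` applies the SiLU gate first and then RMS-normalizes, which is what `forward_native` computes. A small self-check sketch (the pure-PyTorch reference below is written here for illustration and assumes a CUDA device):

```python
import torch
from vllm.model_executor.layers.mamba.ops.layernorm_gated import rms_norm_gated

torch.manual_seed(0)
x = torch.randn(4, 256, device="cuda", dtype=torch.float32)
gate = torch.randn_like(x)
weight = torch.randn(256, device="cuda")
eps = 1e-5

# Reference: gate with SiLU first (norm_before_gate=False), then RMS-normalize.
y = x * torch.nn.functional.silu(gate)
ref = y * torch.rsqrt(y.pow(2).mean(-1, keepdim=True) + eps) * weight

out = rms_norm_gated(x, weight, bias=None, z=gate, eps=eps,
                     norm_before_gate=False)
torch.testing.assert_close(out, ref, atol=1e-4, rtol=1e-4)
```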

View File

@ -16,8 +16,9 @@ from vllm.config import ModelConfig, PoolerConfig
from vllm.model_executor.pooling_metadata import ( # noqa: E501
PoolingMetadata as V0PoolingMetadata)
from vllm.model_executor.pooling_metadata import PoolingTensors
from vllm.pooling_params import PoolingParams, PoolingTask
from vllm.pooling_params import PoolingParams
from vllm.sequence import PoolerOutput, PoolingSequenceGroupOutput
from vllm.tasks import PoolingTask
from vllm.utils import resolve_obj_by_qualname
from vllm.v1.pool.metadata import PoolingMetadata as V1PoolingMetadata

View File

@ -33,7 +33,7 @@ from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
find_matched_target, is_activation_quantization_format,
should_ignore_layer)
from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
from vllm.model_executor.layers.quantization.utils.nvfp4_emulation_utils import ( # noqa: E501
from vllm.model_executor.layers.quantization.utils.quant_utils import (
cutlass_fp4_supported)
from vllm.platforms import current_platform

View File

@ -27,8 +27,8 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import (
prepare_moe_fp4_layer_for_marlin)
from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
prepare_moe_fp8_layer_for_marlin)
from vllm.model_executor.layers.quantization.utils.nvfp4_emulation_utils import ( # noqa: E501
cutlass_fp4_supported)
from vllm.model_executor.layers.quantization.utils.quant_utils import (
cutlass_fp4_supported, swizzle_blockscale)
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
all_close_1d, normalize_e4m3fn_to_e4m3fnuz, per_tensor_dequantize)
from vllm.model_executor.utils import set_weight_attrs
@ -193,29 +193,6 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
{"quant_method": FusedMoeWeightScaleSupported.TENSOR.value})
set_weight_attrs(w2_input_scale, extra_weight_attrs)
def swizzle_blockscale(self, scale: torch.tensor):
assert (scale.dtype == torch.float8_e4m3fn)
# Pad and blockwise interleave weight_scale
scale_ndim = scale.ndim
if scale.ndim == 2:
scale = scale.unsqueeze(0)
assert scale.ndim == 3
B, M, K = scale.shape
round_up_multiple = lambda x, m: (x + m - 1) // m * m
M_padded = round_up_multiple(M, 128)
K_padded = round_up_multiple(K, 4)
padded_scale = torch.zeros((B, M_padded, K_padded), dtype=scale.dtype)
padded_scale[:B, :M, :K] = scale
batches, rows, cols = padded_scale.shape
assert rows % 128 == 0
assert cols % 4 == 0
padded_scale = padded_scale.reshape(batches, rows // 128, 4, 32,
cols // 4, 4)
swizzled_scale = padded_scale.permute((0, 1, 4, 3, 2, 5))
swizzled_scale = swizzled_scale.contiguous().cuda()
return (swizzled_scale.reshape(M, K)
if scale_ndim == 2 else swizzled_scale.reshape(B, M, K))
def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
# From packed to weight
@ -243,13 +220,13 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
return
# swizzle weight scales
layer.w13_blockscale_swizzled = torch.nn.Parameter(
self.swizzle_blockscale(layer.w13_weight_scale),
requires_grad=False)
layer.w13_blockscale_swizzled = torch.nn.Parameter(swizzle_blockscale(
layer.w13_weight_scale),
requires_grad=False)
layer.w2_blockscale_swizzled = torch.nn.Parameter(
self.swizzle_blockscale(layer.w2_weight_scale),
requires_grad=False)
layer.w2_blockscale_swizzled = torch.nn.Parameter(swizzle_blockscale(
layer.w2_weight_scale),
requires_grad=False)
# w13
w13_input_global_scale = layer.w13_input_global_scale.max(

View File

@ -9,8 +9,7 @@ from torch.nn.parameter import Parameter
import vllm.envs as envs
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
from vllm._custom_ops import (cutlass_scaled_fp4_mm,
cutlass_scaled_mm_supports_fp4, scaled_fp4_quant)
from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
from vllm.distributed import get_ep_group
from vllm.logger import init_logger
from vllm.model_executor.layers.fused_moe.config import FusedMoEParallelConfig
@ -28,7 +27,7 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import (
apply_fp4_marlin_linear, is_fp4_marlin_supported,
prepare_fp4_layer_for_marlin, prepare_moe_fp4_layer_for_marlin)
from vllm.model_executor.layers.quantization.utils.quant_utils import (
GroupShape, is_layer_skipped)
GroupShape, cutlass_fp4_supported, is_layer_skipped, swizzle_blockscale)
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
Fp8LinearOp, requantize_with_max_scale)
from vllm.model_executor.parameter import (ModelWeightParameter,
@ -667,14 +666,6 @@ class ModelOptNvFp4Config(QuantizationConfig):
return None
def cutlass_fp4_supported() -> bool:
if not current_platform.is_cuda():
return False
capability_tuple = current_platform.get_device_capability()
capability = -1 if capability_tuple is None else capability_tuple.to_int()
return cutlass_scaled_mm_supports_fp4(capability)
class ModelOptFp8KVCacheMethod(BaseKVCacheMethod):
"""
Supports loading kv-cache scaling factors from FP8 checkpoints.
@ -772,29 +763,6 @@ class ModelOptNvFp4LinearMethod(LinearMethodBase):
layer.register_parameter("weight_scale", weight_scale)
def swizzle_blockscale(self, scale: torch.tensor):
assert (scale.dtype == torch.float8_e4m3fn)
# Pad and blockwise interleave weight_scale
scale_ndim = scale.ndim
if scale.ndim == 2:
scale = scale.unsqueeze(0)
assert scale.ndim == 3
B, M, K = scale.shape
round_up_multiple = lambda x, m: (x + m - 1) // m * m
M_padded = round_up_multiple(M, 128)
K_padded = round_up_multiple(K, 4)
padded_scale = torch.zeros((B, M_padded, K_padded), dtype=scale.dtype)
padded_scale[:B, :M, :K] = scale
batches, rows, cols = padded_scale.shape
assert rows % 128 == 0
assert cols % 4 == 0
padded_scale = padded_scale.reshape(batches, rows // 128, 4, 32,
cols // 4, 4)
swizzled_scale = padded_scale.permute((0, 1, 4, 3, 2, 5))
swizzled_scale = swizzled_scale.contiguous().cuda()
return (swizzled_scale.reshape(M, K)
if scale_ndim == 2 else swizzled_scale.reshape(B, M, K))
def process_weights_after_loading(self, layer: Module) -> None:
# global scales:
@ -814,7 +782,7 @@ class ModelOptNvFp4LinearMethod(LinearMethodBase):
"Expected weight_scale.dim(1) to be divisible by 16")
assert (layer.weight_scale.dtype == torch.float8_e4m3fn), (
"Weight Block scale must be represented as FP8-E4M3")
swizzled_weight_scale = self.swizzle_blockscale(layer.weight_scale)
swizzled_weight_scale = swizzle_blockscale(layer.weight_scale)
layer.weight_scale_swizzled = Parameter(swizzled_weight_scale,
requires_grad=False)
@ -1060,29 +1028,6 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
weight_loader=weight_loader)
layer.register_parameter("w2_input_scale", w2_input_scale)
def swizzle_blockscale(self, scale: torch.tensor):
assert (scale.dtype == torch.float8_e4m3fn)
# Pad and blockwise interleave weight_scale
scale_ndim = scale.ndim
if scale.ndim == 2:
scale = scale.unsqueeze(0)
assert scale.ndim == 3
B, M, K = scale.shape
round_up_multiple = lambda x, m: (x + m - 1) // m * m
M_padded = round_up_multiple(M, 128)
K_padded = round_up_multiple(K, 4)
padded_scale = torch.zeros((B, M_padded, K_padded), dtype=scale.dtype)
padded_scale[:B, :M, :K] = scale
batches, rows, cols = padded_scale.shape
assert rows % 128 == 0
assert cols % 4 == 0
padded_scale = padded_scale.reshape(batches, rows // 128, 4, 32,
cols // 4, 4)
swizzled_scale = padded_scale.permute((0, 1, 4, 3, 2, 5))
swizzled_scale = swizzled_scale.contiguous().cuda()
return (swizzled_scale.reshape(M, K)
if scale_ndim == 2 else swizzled_scale.reshape(B, M, K))
def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
# GEMM 1
# The FlashInfer Cutlass fused MoE kernel expects the combined weights
@ -1128,8 +1073,7 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
"Expected weight_scale.dim(1) to be divisible by 16")
assert (layer.w13_weight_scale.dtype == torch.float8_e4m3fn), (
"Weight Blockscale must be represented as FP8-E4M3")
w13_blockscale_swizzled = self.swizzle_blockscale(
layer.w13_weight_scale)
w13_blockscale_swizzled = swizzle_blockscale(layer.w13_weight_scale)
layer.w13_blockscale_swizzled = Parameter(w13_blockscale_swizzled,
requires_grad=False)
@ -1151,7 +1095,7 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
"Expected weight_scale.dim(1) to be divisible by 16")
assert (layer.w2_weight_scale.dtype == torch.float8_e4m3fn), (
"Weight Blockscale must be represented as FP8-E4M3")
w2_blockscale_swizzled = self.swizzle_blockscale(layer.w2_weight_scale)
w2_blockscale_swizzled = swizzle_blockscale(layer.w2_weight_scale)
layer.w2_blockscale_swizzled = Parameter(w2_blockscale_swizzled,
requires_grad=False)

View File

@ -3,18 +3,19 @@
# Copyright © 2025, Oracle and/or its affiliates.
import os
from typing import Any, Optional
from typing import Any, Callable, Optional
import torch
import torch.nn.functional as F
from torch.nn.parameter import Parameter
from vllm.logger import init_logger
from vllm.model_executor.layers.fused_moe import FusedMoE, FusedMoEMethodBase
from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
set_weight_attrs)
from vllm.model_executor.layers.quantization import QuantizationMethods
from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig)
QuantizationConfig, QuantizeMethodBase)
logger = init_logger(__name__)
"""By default, use 8 bit as target precision, but it can be
@ -71,9 +72,11 @@ class RTNConfig(QuantizationConfig):
return cls(weight_bits, group_size)
def get_quant_method(self, layer: torch.nn.Module,
prefix: str) -> Optional["RTNLinearMethod"]:
prefix: str) -> Optional["QuantizeMethodBase"]:
if isinstance(layer, LinearBase):
return RTNLinearMethod(self)
elif isinstance(layer, FusedMoE):
return RTNMoEMethod(self)
return None
@ -94,11 +97,18 @@ class RTNTensor:
self.data.narrow(dim, start // factor, length // factor),
self.scale.narrow(dim, start, length), self.quant_config)
def __getitem__(self, key):
return RTNTensor(self.data[key], self.scale[key], self.quant_config)
@property
def shape(self):
shape = self.data.shape
factor = 1 if self.quant_config.weight_bits == 8 else 2
return torch.Size((shape[0] * factor, shape[1]))
batch_present = len(shape) == 3
if batch_present:
return torch.Size((shape[0], shape[1] * factor, shape[2]))
else:
return torch.Size((shape[0] * factor, shape[1]))
def copy_(self, loaded_weight: torch.Tensor) -> None:
qweight, weight_scale = rtn_quantize(loaded_weight.cuda(),
@ -165,7 +175,7 @@ class RTNLinearMethod(LinearMethodBase):
weight = RTNParameter(data=torch.empty(output_size_per_partition //
factor,
input_size_per_partition,
dtype=torch.int8),
dtype=torch.uint8),
scale=scale,
quant_config=self.quant_config)
@ -180,18 +190,7 @@ class RTNLinearMethod(LinearMethodBase):
layer.output_size_per_partition = output_size_per_partition
def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
"""torch.compile does not know how to deal with a Parameter subclass
(aka RTNParameter). As we don't really need RTNParameters for the
forward pass, we replace them with equivalent instances of Parameters.
"""
old_weight = layer.weight
assert isinstance(old_weight, RTNParameter)
data = old_weight.data.data
delattr(layer, "weight")
new_weight = Parameter(data=data, requires_grad=False)
layer.register_parameter("weight", new_weight)
fix_weights(layer, "weight")
def apply(self,
layer: torch.nn.Module,
@ -209,6 +208,128 @@ class RTNLinearMethod(LinearMethodBase):
return out
class RTNMoEMethod(FusedMoEMethodBase):
def __init__(self, quant_config: RTNConfig):
self.quant_config = quant_config
def create_weights(self, layer: torch.nn.Module, num_experts: int,
hidden_size: int, intermediate_size_per_partition: int,
params_dtype: torch.dtype, **extra_weight_attrs):
factor = 1 if self.quant_config.weight_bits == 8 else 2
# Fused gate_up_proj (column parallel)
num_groups_per_col = (hidden_size // self.quant_config.group_size
if self.quant_config.group_size != -1 else 1)
w13_scale = Parameter(
torch.empty(num_experts,
2 * intermediate_size_per_partition,
num_groups_per_col,
dtype=params_dtype),
requires_grad=False,
)
layer.register_parameter("w13_scale", w13_scale)
w13_weight = RTNParameter(data=torch.empty(
num_experts,
2 * intermediate_size_per_partition // factor,
hidden_size,
dtype=torch.uint8),
scale=w13_scale,
quant_config=self.quant_config)
layer.register_parameter("w13_weight", w13_weight)
set_weight_attrs(w13_weight, extra_weight_attrs)
# down_proj (row parallel)
num_groups_per_col = (intermediate_size_per_partition //
self.quant_config.group_size
if self.quant_config.group_size != -1 else 1)
w2_scale = Parameter(torch.zeros(num_experts,
hidden_size,
num_groups_per_col,
dtype=params_dtype),
requires_grad=False)
layer.register_parameter("w2_scale", w2_scale)
w2_weight = RTNParameter(data=torch.empty(
num_experts,
hidden_size // factor,
intermediate_size_per_partition,
dtype=torch.uint8),
scale=w2_scale,
quant_config=self.quant_config)
layer.register_parameter("w2_weight", w2_weight)
set_weight_attrs(w2_weight, extra_weight_attrs)
def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
weight_bits = self.quant_config.weight_bits
fix_weights(layer, "w13_weight", weight_bits == 4)
fix_weights(layer, "w2_weight", weight_bits == 4)
def apply(
self,
layer: torch.nn.Module,
x: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,
renormalize: bool,
use_grouped_topk: bool = False,
topk_group: Optional[int] = None,
num_expert_group: Optional[int] = None,
global_num_experts: int = -1,
expert_map: Optional[torch.Tensor] = None,
custom_routing_function: Optional[Callable] = None,
scoring_func: str = "softmax",
e_score_correction_bias: Optional[torch.Tensor] = None,
apply_router_weight_on_input: bool = False,
activation: str = "silu",
enable_eplb: bool = False,
expert_load_view: Optional[torch.Tensor] = None,
logical_to_physical_map: Optional[torch.Tensor] = None,
logical_replica_count: Optional[torch.Tensor] = None,
) -> torch.Tensor:
if enable_eplb:
raise NotImplementedError(
"EPLB not supported for `RTNMoEMethod` yet.")
from vllm.model_executor.layers.fused_moe import fused_experts
topk_weights, topk_ids = FusedMoE.select_experts(
hidden_states=x,
router_logits=router_logits,
use_grouped_topk=use_grouped_topk,
top_k=top_k,
renormalize=renormalize,
topk_group=topk_group,
num_expert_group=num_expert_group,
custom_routing_function=custom_routing_function,
scoring_func=scoring_func,
e_score_correction_bias=e_score_correction_bias)
weight_bits = self.quant_config.weight_bits
group_size = self.quant_config.group_size
ret = fused_experts(
x,
layer.w13_weight,
layer.w2_weight,
topk_weights=topk_weights,
topk_ids=topk_ids,
inplace=True,
activation=activation,
use_int4_w4a16=weight_bits == 4,
use_int8_w8a16=weight_bits == 8,
global_num_experts=global_num_experts,
w1_scale=layer.w13_scale,
w2_scale=layer.w2_scale,
apply_router_weight_on_input=apply_router_weight_on_input,
expert_map=expert_map,
block_shape=[0, group_size])
return ret
def rtn_quantize(tensor: torch.Tensor, num_bits: int,
group_size: int) -> tuple[torch.Tensor, torch.Tensor]:
"""Quantize a tensor using per-group static scaling factor.
@ -221,34 +342,44 @@ def rtn_quantize(tensor: torch.Tensor, num_bits: int,
If equal to -1, each row in the input tensor is treated
as one group.
"""
batch_present = len(tensor.shape) == 3
if not batch_present:
tensor = tensor.unsqueeze(0)
q_range = 2**num_bits
num_groups = (tensor.shape[0] * tensor.shape[1] //
group_size if group_size != -1 else tensor.shape[0])
num_groups = (tensor.shape[1] * tensor.shape[2] //
group_size if group_size != -1 else tensor.shape[1])
"""Calculate a scaling factor per input group.
"""
input_flat = tensor.reshape(num_groups, -1)
input_min = torch.min(input_flat, dim=1, keepdim=True)[0]
input_max = torch.max(input_flat, dim=1, keepdim=True)[0]
input_flat = tensor.reshape(tensor.shape[0], num_groups, -1)
input_min = torch.min(input_flat, dim=2, keepdim=True)[0]
input_max = torch.max(input_flat, dim=2, keepdim=True)[0]
input_max_abs = torch.max(input_min.abs(), input_max.abs())
scale = (input_max_abs * 2.0 / (q_range - 1))
"""Scale each input group, truncate and round to the nearest integer.
"""Scale each input group, round to the nearest integer, shift
the range and truncate.
"""
scaled_input = input_flat / scale
scaled_input = scaled_input.clamp(-q_range // 2, q_range // 2 - 1)
scaled_input = scaled_input.round()
scaled_input += q_range // 2
scaled_input = scaled_input.clamp(0, q_range - 1)
scale = scale.reshape(tensor.shape[0], -1).contiguous()
inputs_q = scaled_input.reshape(tensor.shape).to(torch.int8)
scale = scale.reshape(tensor.shape[0], tensor.shape[1], -1).contiguous()
inputs_q = scaled_input.reshape(tensor.shape).to(torch.uint8)
inputs_q = inputs_q.contiguous()
if num_bits == 4:
"""Pack two 4-bit values into each byte.
"""
inputs_q = (inputs_q[:, 1::2] << 4) | (inputs_q[:, ::2] & 0xf)
inputs_q = inputs_q.reshape(tensor.shape[0] // 2, tensor.shape[1])
inputs_q = (inputs_q[:, :, 1::2] << 4) | (inputs_q[:, :, ::2] & 0xf)
inputs_q = inputs_q.reshape(tensor.shape[0], tensor.shape[1] // 2,
tensor.shape[2])
inputs_q = inputs_q.contiguous()
if not batch_present:
inputs_q = inputs_q.squeeze(0)
scale = scale.squeeze(0)
return inputs_q, scale
@ -259,31 +390,60 @@ def rtn_dequantize(tensor: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
tensor: The input tensor.
scale: The tensor with per-group scale factors.
"""
batch_present = len(tensor.shape) == 3
if not batch_present:
tensor = tensor.unsqueeze(0)
scale = scale.unsqueeze(0)
num_groups = scale.size(0) * scale.size(1)
input_dim, output_dim = tensor.shape
num_groups = scale.size(1) * scale.size(2)
batch, input_dim, output_dim = tensor.shape
num_bits = 8 if input_dim == scale.size(0) else 4
num_bits = 8 if input_dim == scale.size(1) else 4
q_range = 2**num_bits
if num_bits == 4:
input_dim *= 2
data = torch.empty((input_dim, output_dim),
data = torch.empty((batch, input_dim, output_dim),
dtype=scale.dtype,
device=tensor.device)
if num_bits == 8:
data.copy_(tensor)
data -= q_range // 2
else:
"""Unpack two 4-bit values from each byte.
"""
tensor = tensor.reshape(input_dim, output_dim // 2)
tensor = tensor.reshape(batch, input_dim, output_dim // 2)
for i in range(2):
data[:, i::2] = (tensor << 4 * (1 - i)) >> 4
data[:, :, i::2] = ((tensor << 4 *
(1 - i)) >> 4).to(torch.int8) - q_range // 2
"""Scale each input group with its scaling factor.
"""
scale = scale.reshape(num_groups, -1)
data = data.reshape(num_groups, -1)
scale = scale.reshape(batch, num_groups, -1)
data = data.reshape(batch, num_groups, -1)
data = torch.mul(data, scale)
input_deq = data.reshape((input_dim, output_dim)).contiguous()
input_deq = data.reshape((batch, input_dim, output_dim)).contiguous()
if not batch_present:
input_deq = input_deq.squeeze(0)
return input_deq
def fix_weights(layer: torch.nn.Module,
param_name: str,
reshape: bool = False):
"""torch.compile does not know how to deal with a Parameter subclass
(aka RTNParameter). As we don't really need RTNParameters for the
forward pass, we replace them with equivalent instances of Parameters.
"""
old_weight = getattr(layer, param_name)
assert isinstance(old_weight, RTNParameter)
data = old_weight.data.data
delattr(layer, param_name)
if reshape:
data = data.reshape(old_weight.shape[0], old_weight.shape[1] * 2, -1)
new_weight = Parameter(data=data, requires_grad=False)
layer.register_parameter(param_name, new_weight)
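
The batched quantize/dequantize pair above stores values as unsigned integers shifted by half the quantization range. The following is a minimal stand-alone sketch of that round-to-nearest scheme, not the repo functions themselves: the helper name is made up, it uses toy shapes with one group per row, and it shows only the 8-bit path (no 4-bit nibble packing).

```python
import torch

def rtn_roundtrip(x: torch.Tensor, num_bits: int = 8) -> torch.Tensor:
    # One scale per row, i.e. group_size == row length in this toy example.
    q_range = 2 ** num_bits
    max_abs = x.abs().amax(dim=-1, keepdim=True)
    scale = max_abs * 2.0 / (q_range - 1)
    q = (x / scale).round() + q_range // 2        # shift into [0, q_range)
    q = q.clamp(0, q_range - 1).to(torch.uint8)   # unsigned storage, as above
    return (q.to(torch.float32) - q_range // 2) * scale   # dequantize

w = torch.randn(4, 16)
print((w - rtn_roundtrip(w)).abs().max())   # small round-trip error
```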

View File

@ -238,13 +238,20 @@ def per_token_group_quant_int8(
int8_min = iinfo.min
x_q = torch.empty_like(x, device=x.device, dtype=dtype)
M = x.numel() // group_size
N = group_size
x_s = torch.empty(
x.shape[:-1] + (x.shape[-1] // group_size, ),
device=x.device,
dtype=torch.float32,
)
# prefer CUDA kernel if available
if current_platform.is_cuda():
torch.ops._C.per_token_group_quant_int8(x, x_q, x_s, group_size, eps,
float(int8_min),
float(int8_max))
return x_q, x_s
M = x.numel() // group_size
N = group_size
BLOCK = triton.next_power_of_2(N)
# heuristics for number of warps
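
For reference, the per-token-group INT8 scheme that the CUDA kernel (and the Triton fallback below it) computes can be written in plain PyTorch roughly as follows. This is an illustrative sketch with a hypothetical function name, not the kernel itself; the exact eps handling in the real kernels may differ.

```python
import torch

def per_token_group_quant_int8_ref(x: torch.Tensor, group_size: int,
                                   eps: float = 1e-10):
    # One float32 scale per contiguous group of `group_size` elements
    # along the last dimension (matching the x_s shape allocated above).
    groups = x.reshape(-1, group_size).float()
    scale = groups.abs().amax(dim=-1, keepdim=True).clamp_min(eps) / 127.0
    q = (groups / scale).round().clamp(-128, 127).to(torch.int8)
    x_q = q.reshape(x.shape)
    x_s = scale.reshape(x.shape[:-1] + (x.shape[-1] // group_size, ))
    return x_q, x_s
```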

View File

@ -2,13 +2,12 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch
from vllm._custom_ops import cutlass_scaled_mm_supports_fp4
from vllm.platforms import current_platform
from vllm.scalar_type import scalar_types
__all__ = [
"break_fp4_bytes", "dequantize_to_dtype", "ref_nvfp4_quant",
"cutlass_fp4_supported"
"break_fp4_bytes",
"dequantize_to_dtype",
"ref_nvfp4_quant",
]
FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max()
@ -17,14 +16,6 @@ kE2M1ToFloat = torch.tensor([0., 0.5, 1., 1.5, 2., 3., 4., 6.],
dtype=torch.float32)
def cutlass_fp4_supported() -> bool:
if not current_platform.is_cuda():
return False
capability_tuple = current_platform.get_device_capability()
capability = -1 if capability_tuple is None else capability_tuple.to_int()
return cutlass_scaled_mm_supports_fp4(capability)
def break_fp4_bytes(a, dtype):
assert a.dtype == torch.uint8
m, n = a.shape

View File

@ -8,8 +8,10 @@ from typing import ClassVar, NamedTuple, Optional
import numpy
import torch
from vllm._custom_ops import cutlass_scaled_mm_supports_fp4
from vllm.model_executor.layers.quantization.qqq import (
MARLIN_QQQ_SUPPORTED_NUM_BITS)
from vllm.platforms import current_platform
from vllm.scalar_type import ScalarType, scalar_types
@ -592,3 +594,56 @@ def awq_pack(
q_w = q_w.reshape((-1, size_n)).contiguous()
return pack_cols(q_w, num_bits, size_k, size_n)
def swizzle_blockscale(scale: torch.Tensor) -> torch.Tensor:
"""
Pad and block-interleave the FP4 block-scales so that they match the data
layout expected by the CUTLASS / FlashInfer kernels.
Parameters
----------
scale: torch.Tensor
    FP4 block scales in ``torch.float8_e4m3fn``, either 2-D ``(M, K)`` or 3-D ``(B, M, K)``.
Returns
-------
torch.Tensor
The swizzled tensor with the same logical shape as *scale*.
"""
assert scale.dtype == torch.float8_e4m3fn, (
"swizzle_blockscale expects the input tensor to be in "
"torch.float8_e4m3fn format.")
scale_ndim = scale.ndim
if scale_ndim == 2:
scale = scale.unsqueeze(0) # (1, M, K)
assert scale.ndim == 3, "Expected a 2-D or 3-D tensor for block scales."
B, M, K = scale.shape
def _round_up(x: int, m: int) -> int:
return (x + m - 1) // m * m
M_padded = _round_up(M, 128)
K_padded = _round_up(K, 4)
padded = torch.zeros((B, M_padded, K_padded),
dtype=scale.dtype,
device=scale.device)
padded[:B, :M, :K] = scale
# Reshape / permute to the layout required by the kernel.
padded = padded.reshape(B, M_padded // 128, 4, 32, K_padded // 4, 4)
swizzled = padded.permute(0, 1, 4, 3, 2, 5).contiguous().cuda()
if scale_ndim == 2:
return swizzled.reshape(M, K)
return swizzled.reshape(B, M, K)
def cutlass_fp4_supported() -> bool:
if not current_platform.is_cuda():
return False
capability_tuple = current_platform.get_device_capability()
capability = -1 if capability_tuple is None else capability_tuple.to_int()
return cutlass_scaled_mm_supports_fp4(capability)
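
A small usage sketch for the relocated helpers, assuming a CUDA-capable build (``swizzle_blockscale`` moves its result to the GPU) and toy dimensions that are already multiples of 128 and 4 so no padding is triggered; the import path matches the one used by the callers earlier in this diff.

```python
import torch
from vllm.model_executor.layers.quantization.utils.quant_utils import (
    cutlass_fp4_supported, swizzle_blockscale)

if cutlass_fp4_supported():
    # FP8-E4M3 block scales with logical shape (M, K) = (256, 16).
    scales = torch.randn(256, 16).to(torch.float8_e4m3fn)
    swizzled = swizzle_blockscale(scales)
    # Same logical shape, but block-interleaved for the CUTLASS/FlashInfer kernels.
    assert swizzled.shape == scales.shape
```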

View File

@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Iterable, Mapping, Sequence
from typing import Optional, TypedDict, Union
from typing import Annotated, Optional, Union
import torch
import torch.nn as nn
@ -29,6 +29,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor,
PromptUpdate)
from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.sequence import IntermediateTensors
from vllm.utils.tensor_schema import TensorSchema, TensorShape
# yapf: disable
from .idefics2_vision_model import Idefics2VisionConfig
@ -42,15 +43,26 @@ from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
merge_multimodal_embeddings)
class AriaImagePixelInputs(TypedDict):
pixel_values: torch.Tensor
pixel_mask: Optional[torch.Tensor]
class AriaImagePixelInputs(TensorSchema):
"""
Shape:
pixel_values: `(batch_size * num_images, num_channels, height, width)`
pixel_mask: `(batch_size * num_images, height, width)`
Dimensions:
- b: Batch size
- n: Number of images
- c: Number of channels
- h: Height of each image
- w: Width of each image
"""
pixel_values: Annotated[
torch.Tensor,
TensorShape("bn", 3, "h", "w"),
]
pixel_mask: Annotated[
Optional[torch.Tensor],
TensorShape("bn", "h", "w"),
]
class AriaVisionTransformer(Idefics3VisionTransformer, SupportsQuant):
packed_modules_mapping = {"qkv_proj": ["q_proj", "k_proj", "v_proj"]}
@ -540,12 +552,6 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal):
self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
self.vocab_size, logit_scale)
def _validate_image_sizes(
self, images: list[torch.Tensor]) -> list[torch.Tensor]:
if not all(img.shape == images[0].shape for img in images):
raise ValueError("All images must be the same size")
return images
def _parse_and_validate_image_input(
self, **kwargs: object) -> Optional[AriaImagePixelInputs]:
pixel_values = kwargs.pop("pixel_values", None)
@ -554,23 +560,9 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal):
if pixel_values is None:
return None
if not isinstance(pixel_values, (torch.Tensor, list)):
raise ValueError("Incorrect type of pixel values. "
f"Got type: {type(pixel_values)}")
pixel_values = self._validate_image_sizes(pixel_values)
pixel_values = flatten_bn(pixel_values, concat=True)
if pixel_mask is not None:
if not isinstance(pixel_mask, (torch.Tensor, list)):
raise ValueError("Incorrect type of pixel mask. "
f"Got type: {type(pixel_mask)}")
pixel_mask = flatten_bn(pixel_mask, concat=True)
return AriaImagePixelInputs(
pixel_values=pixel_values,
pixel_mask=pixel_mask,
pixel_values=flatten_bn(pixel_values, concat=True),
pixel_mask=flatten_bn(pixel_mask, concat=True),
)
def _create_patch_attention_mask(

Some files were not shown because too many files have changed in this diff.