mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-22 15:11:19 +08:00
Merge remote-tracking branch 'nm/lwilkinson/fix-flashmla-full-cudagraph' into wide_ep_working_branch
This commit is contained in:
commit
f1c9ef3afd
@ -74,7 +74,7 @@ Here is an example of one test inside `latency-tests.json`:
|
|||||||
In this example:
|
In this example:
|
||||||
|
|
||||||
- The `test_name` attributes is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`.
|
- The `test_name` attributes is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`.
|
||||||
- The `parameters` attribute control the command line arguments to be used for `benchmark_latency.py`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underline to dash when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
|
- The `parameters` attribute control the command line arguments to be used for `vllm bench latency`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underline to dash when feeding the arguments to `vllm bench latency`. For example, the corresponding command line arguments for `vllm bench latency` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
|
||||||
|
|
||||||
Note that the performance numbers are highly sensitive to the value of the parameters. Please make sure the parameters are set correctly.
|
Note that the performance numbers are highly sensitive to the value of the parameters. Please make sure the parameters are set correctly.
|
||||||
|
|
||||||
@ -82,13 +82,13 @@ WARNING: The benchmarking script will save json results by itself, so please do
|
|||||||
|
|
||||||
### Throughput test
|
### Throughput test
|
||||||
|
|
||||||
The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except for that the parameters will be fed forward to `benchmark_throughput.py`.
|
The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except for that the parameters will be fed forward to `vllm bench throughput`.
|
||||||
|
|
||||||
The number of this test is also stable -- a slight change on the value of this number might vary the performance numbers by a lot.
|
The number of this test is also stable -- a slight change on the value of this number might vary the performance numbers by a lot.
|
||||||
|
|
||||||
### Serving test
|
### Serving test
|
||||||
|
|
||||||
We test the throughput by using `benchmark_serving.py` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example:
|
We test the throughput by using `vllm bench serve` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example:
|
||||||
|
|
||||||
```json
|
```json
|
||||||
[
|
[
|
||||||
@ -118,8 +118,8 @@ Inside this example:
|
|||||||
|
|
||||||
- The `test_name` attribute is also a unique identifier for the test. It must start with `serving_`.
|
- The `test_name` attribute is also a unique identifier for the test. It must start with `serving_`.
|
||||||
- The `server-parameters` includes the command line arguments for vLLM server.
|
- The `server-parameters` includes the command line arguments for vLLM server.
|
||||||
- The `client-parameters` includes the command line arguments for `benchmark_serving.py`.
|
- The `client-parameters` includes the command line arguments for `vllm bench serve`.
|
||||||
- The `qps_list` controls the list of qps for test. It will be used to configure the `--request-rate` parameter in `benchmark_serving.py`
|
- The `qps_list` controls the list of qps for test. It will be used to configure the `--request-rate` parameter in `vllm bench serve`
|
||||||
|
|
||||||
The number of this test is less stable compared to the delay and latency benchmarks (due to randomized sharegpt dataset sampling inside `benchmark_serving.py`), but a large change on this number (e.g. 5% change) still vary the output greatly.
|
The number of this test is less stable compared to the delay and latency benchmarks (due to randomized sharegpt dataset sampling inside `benchmark_serving.py`), but a large change on this number (e.g. 5% change) still vary the output greatly.
|
||||||
|
|
||||||
|
|||||||
@ -100,7 +100,7 @@ if __name__ == "__main__":
|
|||||||
raw_result = json.loads(f.read())
|
raw_result = json.loads(f.read())
|
||||||
|
|
||||||
if "serving" in str(test_file):
|
if "serving" in str(test_file):
|
||||||
# this result is generated via `benchmark_serving.py`
|
# this result is generated via `vllm bench serve` command
|
||||||
|
|
||||||
# attach the benchmarking command to raw_result
|
# attach the benchmarking command to raw_result
|
||||||
try:
|
try:
|
||||||
@ -120,7 +120,7 @@ if __name__ == "__main__":
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
elif "latency" in f.name:
|
elif "latency" in f.name:
|
||||||
# this result is generated via `benchmark_latency.py`
|
# this result is generated via `vllm bench latency` command
|
||||||
|
|
||||||
# attach the benchmarking command to raw_result
|
# attach the benchmarking command to raw_result
|
||||||
try:
|
try:
|
||||||
@ -148,7 +148,7 @@ if __name__ == "__main__":
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
elif "throughput" in f.name:
|
elif "throughput" in f.name:
|
||||||
# this result is generated via `benchmark_throughput.py`
|
# this result is generated via `vllm bench throughput` command
|
||||||
|
|
||||||
# attach the benchmarking command to raw_result
|
# attach the benchmarking command to raw_result
|
||||||
try:
|
try:
|
||||||
|
|||||||
@ -95,12 +95,14 @@ json2args() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
kill_gpu_processes() {
|
kill_gpu_processes() {
|
||||||
pkill -f python
|
pkill -f '[p]ython'
|
||||||
pkill -f python3
|
pkill -f '[p]ython3'
|
||||||
pkill -f tritonserver
|
pkill -f '[t]ritonserver'
|
||||||
pkill -f pt_main_thread
|
pkill -f '[p]t_main_thread'
|
||||||
pkill -f text-generation
|
pkill -f '[t]ext-generation'
|
||||||
pkill -f lmdeploy
|
pkill -f '[l]mdeploy'
|
||||||
|
# vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
|
||||||
|
pkill -f '[V]LLM'
|
||||||
|
|
||||||
while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
|
while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
|
||||||
sleep 1
|
sleep 1
|
||||||
@ -125,7 +127,7 @@ ensure_installed() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
run_serving_tests() {
|
run_serving_tests() {
|
||||||
# run serving tests using `benchmark_serving.py`
|
# run serving tests using `vllm bench serve` command
|
||||||
# $1: a json file specifying serving test cases
|
# $1: a json file specifying serving test cases
|
||||||
|
|
||||||
local serving_test_file
|
local serving_test_file
|
||||||
@ -225,7 +227,7 @@ run_serving_tests() {
|
|||||||
|
|
||||||
if [[ "$dataset_name" = "sharegpt" ]]; then
|
if [[ "$dataset_name" = "sharegpt" ]]; then
|
||||||
|
|
||||||
client_command="python3 benchmark_serving.py \
|
client_command="vllm bench serve \
|
||||||
--backend $backend \
|
--backend $backend \
|
||||||
--tokenizer /tokenizer_cache \
|
--tokenizer /tokenizer_cache \
|
||||||
--model $model \
|
--model $model \
|
||||||
@ -246,7 +248,7 @@ run_serving_tests() {
|
|||||||
sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len')
|
sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len')
|
||||||
sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len')
|
sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len')
|
||||||
|
|
||||||
client_command="python3 benchmark_serving.py \
|
client_command="vllm bench serve \
|
||||||
--backend $backend \
|
--backend $backend \
|
||||||
--tokenizer /tokenizer_cache \
|
--tokenizer /tokenizer_cache \
|
||||||
--model $model \
|
--model $model \
|
||||||
|
|||||||
@ -126,7 +126,8 @@ kill_gpu_processes() {
|
|||||||
ps -aux
|
ps -aux
|
||||||
lsof -t -i:8000 | xargs -r kill -9
|
lsof -t -i:8000 | xargs -r kill -9
|
||||||
pgrep python3 | xargs -r kill -9
|
pgrep python3 | xargs -r kill -9
|
||||||
|
# vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
|
||||||
|
pgrep VLLM | xargs -r kill -9
|
||||||
|
|
||||||
# wait until GPU memory usage smaller than 1GB
|
# wait until GPU memory usage smaller than 1GB
|
||||||
if command -v nvidia-smi; then
|
if command -v nvidia-smi; then
|
||||||
@ -164,7 +165,7 @@ upload_to_buildkite() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
run_latency_tests() {
|
run_latency_tests() {
|
||||||
# run latency tests using `benchmark_latency.py`
|
# run latency tests using `vllm bench latency` command
|
||||||
# $1: a json file specifying latency test cases
|
# $1: a json file specifying latency test cases
|
||||||
|
|
||||||
local latency_test_file
|
local latency_test_file
|
||||||
@ -205,7 +206,7 @@ run_latency_tests() {
|
|||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
latency_command=" $latency_envs python3 benchmark_latency.py \
|
latency_command=" $latency_envs vllm bench latency \
|
||||||
--output-json $RESULTS_FOLDER/${test_name}.json \
|
--output-json $RESULTS_FOLDER/${test_name}.json \
|
||||||
$latency_args"
|
$latency_args"
|
||||||
|
|
||||||
@ -231,7 +232,7 @@ run_latency_tests() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
run_throughput_tests() {
|
run_throughput_tests() {
|
||||||
# run throughput tests using `benchmark_throughput.py`
|
# run throughput tests using `vllm bench throughput`
|
||||||
# $1: a json file specifying throughput test cases
|
# $1: a json file specifying throughput test cases
|
||||||
|
|
||||||
local throughput_test_file
|
local throughput_test_file
|
||||||
@ -272,7 +273,7 @@ run_throughput_tests() {
|
|||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
throughput_command=" $throughput_envs python3 benchmark_throughput.py \
|
throughput_command=" $throughput_envs vllm bench throughput \
|
||||||
--output-json $RESULTS_FOLDER/${test_name}.json \
|
--output-json $RESULTS_FOLDER/${test_name}.json \
|
||||||
$throughput_args"
|
$throughput_args"
|
||||||
|
|
||||||
@ -297,7 +298,7 @@ run_throughput_tests() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
run_serving_tests() {
|
run_serving_tests() {
|
||||||
# run serving tests using `benchmark_serving.py`
|
# run serving tests using `vllm bench serve` command
|
||||||
# $1: a json file specifying serving test cases
|
# $1: a json file specifying serving test cases
|
||||||
|
|
||||||
local serving_test_file
|
local serving_test_file
|
||||||
@ -393,7 +394,7 @@ run_serving_tests() {
|
|||||||
|
|
||||||
# pass the tensor parallel size to the client so that it can be displayed
|
# pass the tensor parallel size to the client so that it can be displayed
|
||||||
# on the benchmark dashboard
|
# on the benchmark dashboard
|
||||||
client_command="python3 benchmark_serving.py \
|
client_command="vllm bench serve \
|
||||||
--save-result \
|
--save-result \
|
||||||
--result-dir $RESULTS_FOLDER \
|
--result-dir $RESULTS_FOLDER \
|
||||||
--result-filename ${new_test_name}.json \
|
--result-filename ${new_test_name}.json \
|
||||||
@ -447,7 +448,7 @@ main() {
|
|||||||
(which jq) || (apt-get update && apt-get -y install jq)
|
(which jq) || (apt-get update && apt-get -y install jq)
|
||||||
(which lsof) || (apt-get update && apt-get install -y lsof)
|
(which lsof) || (apt-get update && apt-get install -y lsof)
|
||||||
|
|
||||||
# get the current IP address, required by benchmark_serving.py
|
# get the current IP address, required by `vllm bench serve` command
|
||||||
export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
|
export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
|
||||||
# turn of the reporting of the status of each request, to clean up the terminal output
|
# turn of the reporting of the status of each request, to clean up the terminal output
|
||||||
export VLLM_LOGGING_LEVEL="WARNING"
|
export VLLM_LOGGING_LEVEL="WARNING"
|
||||||
|
|||||||
@ -83,7 +83,7 @@ function cpu_tests() {
|
|||||||
set -e
|
set -e
|
||||||
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
|
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
|
||||||
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
|
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
|
||||||
python3 benchmarks/benchmark_serving.py \
|
vllm bench serve \
|
||||||
--backend vllm \
|
--backend vllm \
|
||||||
--dataset-name random \
|
--dataset-name random \
|
||||||
--model meta-llama/Llama-3.2-3B-Instruct \
|
--model meta-llama/Llama-3.2-3B-Instruct \
|
||||||
|
|||||||
166
.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
Executable file
166
.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
Executable file
@ -0,0 +1,166 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
set -xu
|
||||||
|
|
||||||
|
|
||||||
|
remove_docker_container() {
|
||||||
|
docker rm -f tpu-test || true;
|
||||||
|
docker rm -f vllm-tpu || true;
|
||||||
|
}
|
||||||
|
|
||||||
|
trap remove_docker_container EXIT
|
||||||
|
|
||||||
|
# Remove the container that might not be cleaned up in the previous run.
|
||||||
|
remove_docker_container
|
||||||
|
|
||||||
|
# Build the docker image.
|
||||||
|
docker build -f docker/Dockerfile.tpu -t vllm-tpu .
|
||||||
|
|
||||||
|
# Set up cleanup.
|
||||||
|
cleanup_docker() {
|
||||||
|
# Get Docker's root directory
|
||||||
|
docker_root=$(docker info -f '{{.DockerRootDir}}')
|
||||||
|
if [ -z "$docker_root" ]; then
|
||||||
|
echo "Failed to determine Docker root directory."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
echo "Docker root directory: $docker_root"
|
||||||
|
# Check disk usage of the filesystem where Docker's root directory is located
|
||||||
|
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
|
||||||
|
# Define the threshold
|
||||||
|
threshold=70
|
||||||
|
if [ "$disk_usage" -gt "$threshold" ]; then
|
||||||
|
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
|
||||||
|
# Remove dangling images (those that are not tagged and not used by any container)
|
||||||
|
docker image prune -f
|
||||||
|
# Remove unused volumes / force the system prune for old images as well.
|
||||||
|
docker volume prune -f && docker system prune --force --filter "until=72h" --all
|
||||||
|
echo "Docker images and volumes cleanup completed."
|
||||||
|
else
|
||||||
|
echo "Disk usage is below $threshold%. No cleanup needed."
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
cleanup_docker
|
||||||
|
|
||||||
|
# For HF_TOKEN.
|
||||||
|
source /etc/environment
|
||||||
|
|
||||||
|
docker run --privileged --net host --shm-size=16G -it \
|
||||||
|
-e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
|
||||||
|
vllm-tpu /bin/bash -c '
|
||||||
|
set -e # Exit immediately if a command exits with a non-zero status.
|
||||||
|
set -u # Treat unset variables as an error.
|
||||||
|
|
||||||
|
echo "--- Starting script inside Docker container ---"
|
||||||
|
|
||||||
|
# Create results directory
|
||||||
|
RESULTS_DIR=$(mktemp -d)
|
||||||
|
# If mktemp fails, set -e will cause the script to exit.
|
||||||
|
echo "Results will be stored in: $RESULTS_DIR"
|
||||||
|
|
||||||
|
# Install dependencies
|
||||||
|
echo "--- Installing Python dependencies ---"
|
||||||
|
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
|
||||||
|
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
|
||||||
|
&& python3 -m pip install --progress-bar off lm_eval[api]==0.4.4 \
|
||||||
|
&& python3 -m pip install --progress-bar off hf-transfer
|
||||||
|
echo "--- Python dependencies installed ---"
|
||||||
|
export VLLM_USE_V1=1
|
||||||
|
export VLLM_XLA_CHECK_RECOMPILATION=1
|
||||||
|
export VLLM_XLA_CACHE_PATH=
|
||||||
|
echo "Using VLLM V1"
|
||||||
|
|
||||||
|
echo "--- Hardware Information ---"
|
||||||
|
# tpu-info
|
||||||
|
echo "--- Starting Tests ---"
|
||||||
|
set +e
|
||||||
|
overall_script_exit_code=0
|
||||||
|
|
||||||
|
# --- Test Definitions ---
|
||||||
|
# If a test fails, this function will print logs and will not cause the main script to exit.
|
||||||
|
run_test() {
|
||||||
|
local test_num=$1
|
||||||
|
local test_name=$2
|
||||||
|
local test_command=$3
|
||||||
|
local log_file="$RESULTS_DIR/test_${test_num}.log"
|
||||||
|
local actual_exit_code
|
||||||
|
|
||||||
|
echo "--- TEST_$test_num: Running $test_name ---"
|
||||||
|
|
||||||
|
# Execute the test command.
|
||||||
|
eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2)
|
||||||
|
actual_exit_code=$?
|
||||||
|
|
||||||
|
echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" # This goes to main log
|
||||||
|
echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file" # Also to per-test log
|
||||||
|
|
||||||
|
if [ "$actual_exit_code" -ne 0 ]; then
|
||||||
|
echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2
|
||||||
|
echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2
|
||||||
|
if [ -f "$log_file" ]; then
|
||||||
|
cat "$log_file" >&2
|
||||||
|
else
|
||||||
|
echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2
|
||||||
|
fi
|
||||||
|
echo "--- End of log for TEST_$test_num ($test_name) ---" >&2
|
||||||
|
return "$actual_exit_code" # Return the failure code
|
||||||
|
else
|
||||||
|
echo "TEST_$test_num ($test_name) PASSED."
|
||||||
|
return 0 # Return success
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
# Helper function to call run_test and update the overall script exit code
|
||||||
|
run_and_track_test() {
|
||||||
|
local test_num_arg="$1"
|
||||||
|
local test_name_arg="$2"
|
||||||
|
local test_command_arg="$3"
|
||||||
|
|
||||||
|
# Run the test
|
||||||
|
run_test "$test_num_arg" "$test_name_arg" "$test_command_arg"
|
||||||
|
local test_specific_exit_code=$?
|
||||||
|
|
||||||
|
# If the test failed, set the overall script exit code to 1
|
||||||
|
if [ "$test_specific_exit_code" -ne 0 ]; then
|
||||||
|
# No need for extra echo here, run_test already logged the failure.
|
||||||
|
overall_script_exit_code=1
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
# --- Actual Test Execution ---
|
||||||
|
run_and_track_test 1 "test_struct_output_generate.py" \
|
||||||
|
"HF_HUB_DISABLE_XET=1 python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
|
||||||
|
run_and_track_test 2 "test_moe_pallas.py" \
|
||||||
|
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
|
||||||
|
run_and_track_test 3 "test_lora.py" \
|
||||||
|
"VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
|
||||||
|
run_and_track_test 4 "test_tpu_qkv_linear.py" \
|
||||||
|
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
|
||||||
|
run_and_track_test 5 "test_spmd_model_weight_loading.py" \
|
||||||
|
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
|
||||||
|
run_and_track_test 6 "test_kv_cache_update_kernel.py" \
|
||||||
|
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py"
|
||||||
|
|
||||||
|
# After all tests have been attempted, exit with the overall status.
|
||||||
|
if [ "$overall_script_exit_code" -ne 0 ]; then
|
||||||
|
echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---"
|
||||||
|
else
|
||||||
|
echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---"
|
||||||
|
fi
|
||||||
|
exit "$overall_script_exit_code"
|
||||||
|
' # IMPORTANT: This is the closing single quote for the bash -c "..." command. Ensure it is present and correct.
|
||||||
|
|
||||||
|
# Capture the exit code of the docker run command
|
||||||
|
DOCKER_RUN_EXIT_CODE=$?
|
||||||
|
|
||||||
|
# The trap will run for cleanup.
|
||||||
|
# Exit the main script with the Docker run command's exit code.
|
||||||
|
if [ "$DOCKER_RUN_EXIT_CODE" -ne 0 ]; then
|
||||||
|
echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE."
|
||||||
|
exit "$DOCKER_RUN_EXIT_CODE"
|
||||||
|
else
|
||||||
|
echo "Docker run command completed successfully."
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
# TODO: This test fails because it uses RANDOM_SEED sampling
|
||||||
|
# pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
|
||||||
@ -150,18 +150,6 @@ run_and_track_test 9 "test_multimodal.py" \
|
|||||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py"
|
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py"
|
||||||
run_and_track_test 10 "test_pallas.py" \
|
run_and_track_test 10 "test_pallas.py" \
|
||||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py"
|
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py"
|
||||||
run_and_track_test 11 "test_struct_output_generate.py" \
|
|
||||||
"HF_HUB_DISABLE_XET=1 python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
|
|
||||||
run_and_track_test 12 "test_moe_pallas.py" \
|
|
||||||
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
|
|
||||||
run_and_track_test 13 "test_lora.py" \
|
|
||||||
"VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
|
|
||||||
run_and_track_test 14 "test_tpu_qkv_linear.py" \
|
|
||||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
|
|
||||||
run_and_track_test 15 "test_spmd_model_weight_loading.py" \
|
|
||||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
|
|
||||||
run_and_track_test 16 "test_kv_cache_update_kernel.py" \
|
|
||||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py"
|
|
||||||
|
|
||||||
# After all tests have been attempted, exit with the overall status.
|
# After all tests have been attempted, exit with the overall status.
|
||||||
if [ "$overall_script_exit_code" -ne 0 ]; then
|
if [ "$overall_script_exit_code" -ne 0 ]; then
|
||||||
|
|||||||
@ -11,10 +11,10 @@ cd "$(dirname "${BASH_SOURCE[0]}")/../.."
|
|||||||
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
|
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
|
||||||
|
|
||||||
# run python-based benchmarks and upload the result to buildkite
|
# run python-based benchmarks and upload the result to buildkite
|
||||||
python3 benchmarks/benchmark_latency.py --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
|
vllm bench latency --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
|
||||||
bench_latency_exit_code=$?
|
bench_latency_exit_code=$?
|
||||||
|
|
||||||
python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
|
vllm bench throughput --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
|
||||||
bench_throughput_exit_code=$?
|
bench_throughput_exit_code=$?
|
||||||
|
|
||||||
# run server-based benchmarks and upload the result to buildkite
|
# run server-based benchmarks and upload the result to buildkite
|
||||||
@ -24,7 +24,7 @@ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/r
|
|||||||
|
|
||||||
# wait for server to start, timeout after 600 seconds
|
# wait for server to start, timeout after 600 seconds
|
||||||
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
|
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
|
||||||
python3 benchmarks/benchmark_serving.py \
|
vllm bench serve \
|
||||||
--backend vllm \
|
--backend vllm \
|
||||||
--dataset-name sharegpt \
|
--dataset-name sharegpt \
|
||||||
--dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \
|
--dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \
|
||||||
|
|||||||
@ -77,7 +77,7 @@ done
|
|||||||
echo "run benchmark test..."
|
echo "run benchmark test..."
|
||||||
echo "logging to $BM_LOG"
|
echo "logging to $BM_LOG"
|
||||||
echo
|
echo
|
||||||
python benchmarks/benchmark_serving.py \
|
vllm bench serve \
|
||||||
--backend vllm \
|
--backend vllm \
|
||||||
--model $MODEL \
|
--model $MODEL \
|
||||||
--dataset-name sonnet \
|
--dataset-name sonnet \
|
||||||
|
|||||||
2
.github/workflows/lint-and-deploy.yaml
vendored
2
.github/workflows/lint-and-deploy.yaml
vendored
@ -7,7 +7,7 @@ permissions:
|
|||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
lint-and-deploy:
|
lint-and-deploy:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-24.04-arm
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||||
|
|||||||
@ -98,7 +98,7 @@ Then run the benchmarking script
|
|||||||
```bash
|
```bash
|
||||||
# download dataset
|
# download dataset
|
||||||
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
||||||
python3 vllm/benchmarks/benchmark_serving.py \
|
vllm bench serve \
|
||||||
--backend vllm \
|
--backend vllm \
|
||||||
--model NousResearch/Hermes-3-Llama-3.1-8B \
|
--model NousResearch/Hermes-3-Llama-3.1-8B \
|
||||||
--endpoint /v1/completions \
|
--endpoint /v1/completions \
|
||||||
@ -150,7 +150,7 @@ VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct --disable-log-requests
|
|||||||
|
|
||||||
```bash
|
```bash
|
||||||
# run benchmarking script
|
# run benchmarking script
|
||||||
python3 benchmarks/benchmark_serving.py --port 9001 --save-result --save-detailed \
|
vllm bench serve --port 9001 --save-result --save-detailed \
|
||||||
--backend vllm \
|
--backend vllm \
|
||||||
--model meta-llama/Llama-3.1-8B-Instruct \
|
--model meta-llama/Llama-3.1-8B-Instruct \
|
||||||
--endpoint /v1/completions \
|
--endpoint /v1/completions \
|
||||||
@ -174,7 +174,7 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
|
|||||||
```
|
```
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python3 vllm/benchmarks/benchmark_serving.py \
|
vllm bench serve \
|
||||||
--backend openai-chat \
|
--backend openai-chat \
|
||||||
--model Qwen/Qwen2-VL-7B-Instruct \
|
--model Qwen/Qwen2-VL-7B-Instruct \
|
||||||
--endpoint /v1/chat/completions \
|
--endpoint /v1/chat/completions \
|
||||||
@ -194,7 +194,7 @@ VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
|
|||||||
```
|
```
|
||||||
|
|
||||||
``` bash
|
``` bash
|
||||||
python3 benchmarks/benchmark_serving.py \
|
vllm bench serve \
|
||||||
--model meta-llama/Meta-Llama-3-8B-Instruct \
|
--model meta-llama/Meta-Llama-3-8B-Instruct \
|
||||||
--dataset-name hf \
|
--dataset-name hf \
|
||||||
--dataset-path likaixin/InstructCoder \
|
--dataset-path likaixin/InstructCoder \
|
||||||
@ -210,7 +210,7 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
|
|||||||
**`lmms-lab/LLaVA-OneVision-Data`**
|
**`lmms-lab/LLaVA-OneVision-Data`**
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python3 vllm/benchmarks/benchmark_serving.py \
|
vllm bench serve \
|
||||||
--backend openai-chat \
|
--backend openai-chat \
|
||||||
--model Qwen/Qwen2-VL-7B-Instruct \
|
--model Qwen/Qwen2-VL-7B-Instruct \
|
||||||
--endpoint /v1/chat/completions \
|
--endpoint /v1/chat/completions \
|
||||||
@ -224,7 +224,7 @@ python3 vllm/benchmarks/benchmark_serving.py \
|
|||||||
**`Aeala/ShareGPT_Vicuna_unfiltered`**
|
**`Aeala/ShareGPT_Vicuna_unfiltered`**
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python3 vllm/benchmarks/benchmark_serving.py \
|
vllm bench serve \
|
||||||
--backend openai-chat \
|
--backend openai-chat \
|
||||||
--model Qwen/Qwen2-VL-7B-Instruct \
|
--model Qwen/Qwen2-VL-7B-Instruct \
|
||||||
--endpoint /v1/chat/completions \
|
--endpoint /v1/chat/completions \
|
||||||
@ -237,7 +237,7 @@ python3 vllm/benchmarks/benchmark_serving.py \
|
|||||||
**`AI-MO/aimo-validation-aime`**
|
**`AI-MO/aimo-validation-aime`**
|
||||||
|
|
||||||
``` bash
|
``` bash
|
||||||
python3 vllm/benchmarks/benchmark_serving.py \
|
vllm bench serve \
|
||||||
--model Qwen/QwQ-32B \
|
--model Qwen/QwQ-32B \
|
||||||
--dataset-name hf \
|
--dataset-name hf \
|
||||||
--dataset-path AI-MO/aimo-validation-aime \
|
--dataset-path AI-MO/aimo-validation-aime \
|
||||||
@ -248,7 +248,7 @@ python3 vllm/benchmarks/benchmark_serving.py \
|
|||||||
**`philschmid/mt-bench`**
|
**`philschmid/mt-bench`**
|
||||||
|
|
||||||
``` bash
|
``` bash
|
||||||
python3 vllm/benchmarks/benchmark_serving.py \
|
vllm bench serve \
|
||||||
--model Qwen/QwQ-32B \
|
--model Qwen/QwQ-32B \
|
||||||
--dataset-name hf \
|
--dataset-name hf \
|
||||||
--dataset-path philschmid/mt-bench \
|
--dataset-path philschmid/mt-bench \
|
||||||
@ -261,7 +261,7 @@ When using OpenAI-compatible backends such as `vllm`, optional sampling
|
|||||||
parameters can be specified. Example client command:
|
parameters can be specified. Example client command:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python3 vllm/benchmarks/benchmark_serving.py \
|
vllm bench serve \
|
||||||
--backend vllm \
|
--backend vllm \
|
||||||
--model NousResearch/Hermes-3-Llama-3.1-8B \
|
--model NousResearch/Hermes-3-Llama-3.1-8B \
|
||||||
--endpoint /v1/completions \
|
--endpoint /v1/completions \
|
||||||
@ -296,7 +296,7 @@ The following arguments can be used to control the ramp-up:
|
|||||||
<br/>
|
<br/>
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python3 vllm/benchmarks/benchmark_throughput.py \
|
vllm bench throughput \
|
||||||
--model NousResearch/Hermes-3-Llama-3.1-8B \
|
--model NousResearch/Hermes-3-Llama-3.1-8B \
|
||||||
--dataset-name sonnet \
|
--dataset-name sonnet \
|
||||||
--dataset-path vllm/benchmarks/sonnet.txt \
|
--dataset-path vllm/benchmarks/sonnet.txt \
|
||||||
@ -314,7 +314,7 @@ Total num output tokens: 1500
|
|||||||
**VisionArena Benchmark for Vision Language Models**
|
**VisionArena Benchmark for Vision Language Models**
|
||||||
|
|
||||||
``` bash
|
``` bash
|
||||||
python3 vllm/benchmarks/benchmark_throughput.py \
|
vllm bench throughput \
|
||||||
--model Qwen/Qwen2-VL-7B-Instruct \
|
--model Qwen/Qwen2-VL-7B-Instruct \
|
||||||
--backend vllm-chat \
|
--backend vllm-chat \
|
||||||
--dataset-name hf \
|
--dataset-name hf \
|
||||||
@ -336,7 +336,7 @@ Total num output tokens: 1280
|
|||||||
``` bash
|
``` bash
|
||||||
VLLM_WORKER_MULTIPROC_METHOD=spawn \
|
VLLM_WORKER_MULTIPROC_METHOD=spawn \
|
||||||
VLLM_USE_V1=1 \
|
VLLM_USE_V1=1 \
|
||||||
python3 vllm/benchmarks/benchmark_throughput.py \
|
vllm bench throughput \
|
||||||
--dataset-name=hf \
|
--dataset-name=hf \
|
||||||
--dataset-path=likaixin/InstructCoder \
|
--dataset-path=likaixin/InstructCoder \
|
||||||
--model=meta-llama/Meta-Llama-3-8B-Instruct \
|
--model=meta-llama/Meta-Llama-3-8B-Instruct \
|
||||||
@ -360,7 +360,7 @@ Total num output tokens: 204800
|
|||||||
**`lmms-lab/LLaVA-OneVision-Data`**
|
**`lmms-lab/LLaVA-OneVision-Data`**
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python3 vllm/benchmarks/benchmark_throughput.py \
|
vllm bench throughput \
|
||||||
--model Qwen/Qwen2-VL-7B-Instruct \
|
--model Qwen/Qwen2-VL-7B-Instruct \
|
||||||
--backend vllm-chat \
|
--backend vllm-chat \
|
||||||
--dataset-name hf \
|
--dataset-name hf \
|
||||||
@ -373,7 +373,7 @@ python3 vllm/benchmarks/benchmark_throughput.py \
|
|||||||
**`Aeala/ShareGPT_Vicuna_unfiltered`**
|
**`Aeala/ShareGPT_Vicuna_unfiltered`**
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python3 vllm/benchmarks/benchmark_throughput.py \
|
vllm bench throughput \
|
||||||
--model Qwen/Qwen2-VL-7B-Instruct \
|
--model Qwen/Qwen2-VL-7B-Instruct \
|
||||||
--backend vllm-chat \
|
--backend vllm-chat \
|
||||||
--dataset-name hf \
|
--dataset-name hf \
|
||||||
@ -385,7 +385,7 @@ python3 vllm/benchmarks/benchmark_throughput.py \
|
|||||||
**`AI-MO/aimo-validation-aime`**
|
**`AI-MO/aimo-validation-aime`**
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python3 benchmarks/benchmark_throughput.py \
|
vllm bench throughput \
|
||||||
--model Qwen/QwQ-32B \
|
--model Qwen/QwQ-32B \
|
||||||
--backend vllm \
|
--backend vllm \
|
||||||
--dataset-name hf \
|
--dataset-name hf \
|
||||||
@ -399,7 +399,7 @@ python3 benchmarks/benchmark_throughput.py \
|
|||||||
``` bash
|
``` bash
|
||||||
# download dataset
|
# download dataset
|
||||||
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
||||||
python3 vllm/benchmarks/benchmark_throughput.py \
|
vllm bench throughput \
|
||||||
--model meta-llama/Llama-2-7b-hf \
|
--model meta-llama/Llama-2-7b-hf \
|
||||||
--backend vllm \
|
--backend vllm \
|
||||||
--dataset_path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
|
--dataset_path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
|
||||||
|
|||||||
@ -105,7 +105,7 @@ After the script finishes, you will find the results in a new, timestamped direc
|
|||||||
|
|
||||||
- **Log Files**: The directory (`$BASE/auto-benchmark/YYYY_MM_DD_HH_MM/`) contains detailed logs for each run:
|
- **Log Files**: The directory (`$BASE/auto-benchmark/YYYY_MM_DD_HH_MM/`) contains detailed logs for each run:
|
||||||
- `vllm_log_...txt`: The log output from the vLLM server for each parameter combination.
|
- `vllm_log_...txt`: The log output from the vLLM server for each parameter combination.
|
||||||
- `bm_log_...txt`: The log output from the `benchmark_serving.py` script for each benchmark run.
|
- `bm_log_...txt`: The log output from the `vllm bench serve` command for each benchmark run.
|
||||||
|
|
||||||
- **Final Result Summary**: A file named `result.txt` is created in the log directory. It contains a summary of each tested combination and concludes with the overall best parameters found.
|
- **Final Result Summary**: A file named `result.txt` is created in the log directory. It contains a summary of each tested combination and concludes with the overall best parameters found.
|
||||||
|
|
||||||
|
|||||||
@ -136,7 +136,7 @@ run_benchmark() {
|
|||||||
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt"
|
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt"
|
||||||
prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 ))
|
prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 ))
|
||||||
adjusted_input_len=$(( INPUT_LEN - prefix_len ))
|
adjusted_input_len=$(( INPUT_LEN - prefix_len ))
|
||||||
python3 benchmarks/benchmark_serving.py \
|
vllm bench serve \
|
||||||
--backend vllm \
|
--backend vllm \
|
||||||
--model $MODEL \
|
--model $MODEL \
|
||||||
--dataset-name random \
|
--dataset-name random \
|
||||||
@ -169,7 +169,7 @@ adjusted_input_len=$(( INPUT_LEN - prefix_len ))
|
|||||||
curl -X POST http://0.0.0.0:8004/reset_prefix_cache
|
curl -X POST http://0.0.0.0:8004/reset_prefix_cache
|
||||||
sleep 5
|
sleep 5
|
||||||
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt"
|
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt"
|
||||||
python3 benchmarks/benchmark_serving.py \
|
vllm bench serve \
|
||||||
--backend vllm \
|
--backend vllm \
|
||||||
--model $MODEL \
|
--model $MODEL \
|
||||||
--dataset-name random \
|
--dataset-name random \
|
||||||
|
|||||||
@ -11,6 +11,7 @@ from typing import Any, Optional
|
|||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
from typing_extensions import deprecated
|
||||||
|
|
||||||
import vllm.envs as envs
|
import vllm.envs as envs
|
||||||
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
|
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
|
||||||
@ -34,6 +35,10 @@ def save_to_pytorch_benchmark_format(
|
|||||||
write_to_json(pt_file, pt_records)
|
write_to_json(pt_file, pt_records)
|
||||||
|
|
||||||
|
|
||||||
|
@deprecated(
|
||||||
|
"benchmark_latency.py is deprecated and will be removed in a "
|
||||||
|
"future version. Please use 'vllm bench latency' instead.",
|
||||||
|
)
|
||||||
def main(args: argparse.Namespace):
|
def main(args: argparse.Namespace):
|
||||||
print(args)
|
print(args)
|
||||||
|
|
||||||
|
|||||||
@ -38,6 +38,7 @@ from typing import Any, Literal, Optional
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
from tqdm.asyncio import tqdm
|
from tqdm.asyncio import tqdm
|
||||||
from transformers import PreTrainedTokenizerBase
|
from transformers import PreTrainedTokenizerBase
|
||||||
|
from typing_extensions import deprecated
|
||||||
|
|
||||||
from backend_request_func import (
|
from backend_request_func import (
|
||||||
ASYNC_REQUEST_FUNCS,
|
ASYNC_REQUEST_FUNCS,
|
||||||
@ -593,6 +594,10 @@ def save_to_pytorch_benchmark_format(
|
|||||||
write_to_json(pt_file, pt_records)
|
write_to_json(pt_file, pt_records)
|
||||||
|
|
||||||
|
|
||||||
|
@deprecated(
|
||||||
|
"benchmark_serving.py is deprecated and will be removed in a future "
|
||||||
|
"version. Please use 'vllm bench serve' instead.",
|
||||||
|
)
|
||||||
def main(args: argparse.Namespace):
|
def main(args: argparse.Namespace):
|
||||||
print(args)
|
print(args)
|
||||||
random.seed(args.seed)
|
random.seed(args.seed)
|
||||||
|
|||||||
@ -15,6 +15,7 @@ import torch
|
|||||||
import uvloop
|
import uvloop
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase
|
from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase
|
||||||
|
from typing_extensions import deprecated
|
||||||
|
|
||||||
from benchmark_dataset import (
|
from benchmark_dataset import (
|
||||||
AIMODataset,
|
AIMODataset,
|
||||||
@ -382,6 +383,10 @@ def get_requests(args, tokenizer):
|
|||||||
return dataset_cls(**common_kwargs).sample(**sample_kwargs)
|
return dataset_cls(**common_kwargs).sample(**sample_kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
@deprecated(
|
||||||
|
"benchmark_throughput.py is deprecated and will be removed in a "
|
||||||
|
"future version. Please use 'vllm bench throughput' instead.",
|
||||||
|
)
|
||||||
def main(args: argparse.Namespace):
|
def main(args: argparse.Namespace):
|
||||||
if args.seed is None:
|
if args.seed is None:
|
||||||
args.seed = 0
|
args.seed = 0
|
||||||
|
|||||||
@ -12,6 +12,8 @@ kill_gpu_processes() {
|
|||||||
# kill all processes on GPU.
|
# kill all processes on GPU.
|
||||||
pgrep pt_main_thread | xargs -r kill -9
|
pgrep pt_main_thread | xargs -r kill -9
|
||||||
pgrep python3 | xargs -r kill -9
|
pgrep python3 | xargs -r kill -9
|
||||||
|
# vLLM names its processes with a VLLM prefix since https://github.com/vllm-project/vllm/pull/21445
|
||||||
|
pgrep VLLM | xargs -r kill -9
|
||||||
sleep 10
|
sleep 10
|
||||||
|
|
||||||
# remove vllm config file
|
# remove vllm config file
|
||||||
@ -76,38 +78,38 @@ benchmark() {
|
|||||||
wait_for_server 8200
|
wait_for_server 8200
|
||||||
|
|
||||||
# let the prefill instance finish prefill
|
# let the prefill instance finish prefill
|
||||||
python3 ../benchmark_serving.py \
|
vllm bench serve \
|
||||||
--backend vllm \
|
--backend vllm \
|
||||||
--model $model \
|
--model $model \
|
||||||
--dataset-name $dataset_name \
|
--dataset-name $dataset_name \
|
||||||
--dataset-path $dataset_path \
|
--dataset-path $dataset_path \
|
||||||
--sonnet-input-len $input_len \
|
--sonnet-input-len $input_len \
|
||||||
--sonnet-output-len "$output_len" \
|
--sonnet-output-len "$output_len" \
|
||||||
--sonnet-prefix-len $prefix_len \
|
--sonnet-prefix-len $prefix_len \
|
||||||
--num-prompts $num_prompts \
|
--num-prompts $num_prompts \
|
||||||
--port 8100 \
|
--port 8100 \
|
||||||
--save-result \
|
--save-result \
|
||||||
--result-dir $results_folder \
|
--result-dir $results_folder \
|
||||||
--result-filename disagg_prefill_tp1.json \
|
--result-filename disagg_prefill_tp1.json \
|
||||||
--request-rate "inf"
|
--request-rate "inf"
|
||||||
|
|
||||||
|
|
||||||
# send the request to decode.
|
# send the request to decode.
|
||||||
# The TTFT of this command will be the overhead of disagg prefill impl.
|
# The TTFT of this command will be the overhead of disagg prefill impl.
|
||||||
python3 ../benchmark_serving.py \
|
vllm bench serve \
|
||||||
--backend vllm \
|
--backend vllm \
|
||||||
--model $model \
|
--model $model \
|
||||||
--dataset-name $dataset_name \
|
--dataset-name $dataset_name \
|
||||||
--dataset-path $dataset_path \
|
--dataset-path $dataset_path \
|
||||||
--sonnet-input-len $input_len \
|
--sonnet-input-len $input_len \
|
||||||
--sonnet-output-len "$output_len" \
|
--sonnet-output-len "$output_len" \
|
||||||
--sonnet-prefix-len $prefix_len \
|
--sonnet-prefix-len $prefix_len \
|
||||||
--num-prompts $num_prompts \
|
--num-prompts $num_prompts \
|
||||||
--port 8200 \
|
--port 8200 \
|
||||||
--save-result \
|
--save-result \
|
||||||
--result-dir $results_folder \
|
--result-dir $results_folder \
|
||||||
--result-filename disagg_prefill_tp1_overhead.json \
|
--result-filename disagg_prefill_tp1_overhead.json \
|
||||||
--request-rate "$qps"
|
--request-rate "$qps"
|
||||||
kill_gpu_processes
|
kill_gpu_processes
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -18,6 +18,8 @@ kill_gpu_processes() {
|
|||||||
# kill all processes on GPU.
|
# kill all processes on GPU.
|
||||||
pgrep pt_main_thread | xargs -r kill -9
|
pgrep pt_main_thread | xargs -r kill -9
|
||||||
pgrep python3 | xargs -r kill -9
|
pgrep python3 | xargs -r kill -9
|
||||||
|
# vLLM names its processes with a VLLM prefix since https://github.com/vllm-project/vllm/pull/21445
|
||||||
|
pgrep VLLM | xargs -r kill -9
|
||||||
for port in 8000 8100 8200; do lsof -t -i:$port | xargs -r kill -9; done
|
for port in 8000 8100 8200; do lsof -t -i:$port | xargs -r kill -9; done
|
||||||
sleep 1
|
sleep 1
|
||||||
}
|
}
|
||||||
@ -97,20 +99,20 @@ benchmark() {
|
|||||||
output_len=$2
|
output_len=$2
|
||||||
tag=$3
|
tag=$3
|
||||||
|
|
||||||
python3 ../benchmark_serving.py \
|
vllm bench serve \
|
||||||
--backend vllm \
|
--backend vllm \
|
||||||
--model $model \
|
--model $model \
|
||||||
--dataset-name $dataset_name \
|
--dataset-name $dataset_name \
|
||||||
--dataset-path $dataset_path \
|
--dataset-path $dataset_path \
|
||||||
--sonnet-input-len $input_len \
|
--sonnet-input-len $input_len \
|
||||||
--sonnet-output-len "$output_len" \
|
--sonnet-output-len "$output_len" \
|
||||||
--sonnet-prefix-len $prefix_len \
|
--sonnet-prefix-len $prefix_len \
|
||||||
--num-prompts $num_prompts \
|
--num-prompts $num_prompts \
|
||||||
--port 8000 \
|
--port 8000 \
|
||||||
--save-result \
|
--save-result \
|
||||||
--result-dir $results_folder \
|
--result-dir $results_folder \
|
||||||
--result-filename "$tag"-qps-"$qps".json \
|
--result-filename "$tag"-qps-"$qps".json \
|
||||||
--request-rate "$qps"
|
--request-rate "$qps"
|
||||||
|
|
||||||
sleep 2
|
sleep 2
|
||||||
}
|
}
|
||||||
|
|||||||
@ -5,9 +5,8 @@ import itertools
|
|||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from vllm import _custom_ops as ops
|
|
||||||
from vllm.model_executor.layers.fused_moe.moe_align_block_size import (
|
from vllm.model_executor.layers.fused_moe.moe_align_block_size import (
|
||||||
moe_align_block_size_triton,
|
moe_align_block_size,
|
||||||
)
|
)
|
||||||
from vllm.triton_utils import triton
|
from vllm.triton_utils import triton
|
||||||
|
|
||||||
@ -21,60 +20,6 @@ def get_topk_ids(num_tokens: int, num_experts: int, topk: int) -> torch.Tensor:
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def check_correctness(num_tokens, num_experts=256, block_size=256, topk=8):
|
|
||||||
"""
|
|
||||||
Verifies vllm vs. Triton
|
|
||||||
"""
|
|
||||||
topk_ids = get_topk_ids(num_tokens, num_experts, topk)
|
|
||||||
|
|
||||||
# 1. malloc space for triton and vllm
|
|
||||||
# malloc enough space (max_num_tokens_padded) for the sorted ids
|
|
||||||
max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
|
|
||||||
sorted_ids_triton = torch.empty(
|
|
||||||
(max_num_tokens_padded,), dtype=torch.int32, device="cuda"
|
|
||||||
)
|
|
||||||
expert_ids_triton = torch.empty(
|
|
||||||
(max_num_tokens_padded // block_size,), dtype=torch.int32, device="cuda"
|
|
||||||
)
|
|
||||||
num_tokens_post_pad_triton = torch.empty((1,), dtype=torch.int32, device="cuda")
|
|
||||||
|
|
||||||
sorted_ids_vllm = torch.empty_like(sorted_ids_triton)
|
|
||||||
expert_ids_vllm = torch.empty_like(expert_ids_triton)
|
|
||||||
num_tokens_post_pad_vllm = torch.empty_like(num_tokens_post_pad_triton)
|
|
||||||
|
|
||||||
# 2. run implementations
|
|
||||||
moe_align_block_size_triton(
|
|
||||||
topk_ids,
|
|
||||||
num_experts,
|
|
||||||
block_size,
|
|
||||||
sorted_ids_triton,
|
|
||||||
expert_ids_triton,
|
|
||||||
num_tokens_post_pad_triton,
|
|
||||||
)
|
|
||||||
|
|
||||||
ops.moe_align_block_size(
|
|
||||||
topk_ids,
|
|
||||||
num_experts,
|
|
||||||
block_size,
|
|
||||||
sorted_ids_vllm,
|
|
||||||
expert_ids_vllm,
|
|
||||||
num_tokens_post_pad_vllm,
|
|
||||||
)
|
|
||||||
print(f"✅ VLLM implementation works with {num_experts} experts!")
|
|
||||||
|
|
||||||
# 3. compare results
|
|
||||||
if torch.allclose(expert_ids_triton, expert_ids_vllm) and torch.allclose(
|
|
||||||
num_tokens_post_pad_triton, num_tokens_post_pad_vllm
|
|
||||||
):
|
|
||||||
print("✅ Triton and VLLM implementations match.")
|
|
||||||
else:
|
|
||||||
print("❌ Triton and VLLM implementations DO NOT match.")
|
|
||||||
print("Triton expert_ids:", expert_ids_triton)
|
|
||||||
print("VLLM expert_ids:", expert_ids_vllm)
|
|
||||||
print("Triton num_tokens_post_pad:", num_tokens_post_pad_triton)
|
|
||||||
print("VLLM num_tokens_post_pad:", num_tokens_post_pad_vllm)
|
|
||||||
|
|
||||||
|
|
||||||
# test configurations
|
# test configurations
|
||||||
num_tokens_range = [1, 16, 256, 4096]
|
num_tokens_range = [1, 16, 256, 4096]
|
||||||
num_experts_range = [16, 64, 224, 256, 280, 512]
|
num_experts_range = [16, 64, 224, 256, 280, 512]
|
||||||
@ -87,8 +32,8 @@ configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range
|
|||||||
x_names=["num_tokens", "num_experts", "topk"],
|
x_names=["num_tokens", "num_experts", "topk"],
|
||||||
x_vals=configs,
|
x_vals=configs,
|
||||||
line_arg="provider",
|
line_arg="provider",
|
||||||
line_vals=["vllm", "triton"], # "triton"
|
line_vals=["vllm"],
|
||||||
line_names=["VLLM", "Triton"], # "Triton"
|
line_names=["vLLM"],
|
||||||
plot_name="moe-align-block-size-performance",
|
plot_name="moe-align-block-size-performance",
|
||||||
args={},
|
args={},
|
||||||
)
|
)
|
||||||
@ -98,36 +43,11 @@ def benchmark(num_tokens, num_experts, topk, provider):
|
|||||||
block_size = 256
|
block_size = 256
|
||||||
topk_ids = get_topk_ids(num_tokens, num_experts, topk)
|
topk_ids = get_topk_ids(num_tokens, num_experts, topk)
|
||||||
|
|
||||||
max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
|
|
||||||
sorted_ids = torch.empty((max_num_tokens_padded,), dtype=torch.int32, device="cuda")
|
|
||||||
max_num_m_blocks = max_num_tokens_padded // block_size
|
|
||||||
expert_ids = torch.empty((max_num_m_blocks,), dtype=torch.int32, device="cuda")
|
|
||||||
num_tokens_post_pad = torch.empty((1,), dtype=torch.int32, device="cuda")
|
|
||||||
|
|
||||||
quantiles = [0.5, 0.2, 0.8]
|
quantiles = [0.5, 0.2, 0.8]
|
||||||
|
|
||||||
if provider == "vllm":
|
if provider == "vllm":
|
||||||
ms, min_ms, max_ms = triton.testing.do_bench(
|
ms, min_ms, max_ms = triton.testing.do_bench(
|
||||||
lambda: ops.moe_align_block_size(
|
lambda: moe_align_block_size(topk_ids, block_size, num_experts),
|
||||||
topk_ids,
|
|
||||||
num_experts,
|
|
||||||
block_size,
|
|
||||||
sorted_ids.clone(),
|
|
||||||
expert_ids.clone(),
|
|
||||||
num_tokens_post_pad.clone(),
|
|
||||||
),
|
|
||||||
quantiles=quantiles,
|
|
||||||
)
|
|
||||||
elif provider == "triton":
|
|
||||||
ms, min_ms, max_ms = triton.testing.do_bench(
|
|
||||||
lambda: moe_align_block_size_triton(
|
|
||||||
topk_ids,
|
|
||||||
num_experts,
|
|
||||||
block_size,
|
|
||||||
sorted_ids.clone(),
|
|
||||||
expert_ids.clone(),
|
|
||||||
num_tokens_post_pad.clone(),
|
|
||||||
),
|
|
||||||
quantiles=quantiles,
|
quantiles=quantiles,
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -151,6 +71,4 @@ if __name__ == "__main__":
|
|||||||
)
|
)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
print("Running correctness check...")
|
|
||||||
check_correctness(num_tokens=1024, num_experts=args.num_experts, topk=args.topk)
|
|
||||||
benchmark.run(print_data=True, show_plots=True)
|
benchmark.run(print_data=True, show_plots=True)
|
||||||
|
|||||||
@ -8,12 +8,13 @@ import ray
|
|||||||
import torch
|
import torch
|
||||||
from transformers import AutoConfig
|
from transformers import AutoConfig
|
||||||
|
|
||||||
from vllm.model_executor.layers.fused_moe.deep_gemm_moe import (
|
from vllm.model_executor.layers.fused_moe.fused_moe import *
|
||||||
|
from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import (
|
||||||
_moe_permute,
|
_moe_permute,
|
||||||
_moe_unpermute_and_reduce,
|
_moe_unpermute_and_reduce,
|
||||||
|
moe_permute,
|
||||||
|
moe_unpermute,
|
||||||
)
|
)
|
||||||
from vllm.model_executor.layers.fused_moe.fused_moe import *
|
|
||||||
from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import *
|
|
||||||
from vllm.model_executor.layers.fused_moe.utils import _fp8_quantize
|
from vllm.model_executor.layers.fused_moe.utils import _fp8_quantize
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
from vllm.utils import FlexibleArgumentParser
|
from vllm.utils import FlexibleArgumentParser
|
||||||
@ -63,18 +64,19 @@ def benchmark_permute(
|
|||||||
|
|
||||||
def run():
|
def run():
|
||||||
if use_customized_permute:
|
if use_customized_permute:
|
||||||
(permuted_hidden_states, first_token_off, inv_perm_idx, m_indices) = (
|
(
|
||||||
moe_permute(
|
permuted_hidden_states,
|
||||||
qhidden_states,
|
a1q_scale,
|
||||||
topk_weights=topk_weights,
|
first_token_off,
|
||||||
topk_ids=topk_ids,
|
inv_perm_idx,
|
||||||
token_expert_indices=token_expert_indices,
|
m_indices,
|
||||||
topk=topk,
|
) = moe_permute(
|
||||||
n_expert=num_experts,
|
qhidden_states,
|
||||||
n_local_expert=num_experts,
|
a1q_scale=None,
|
||||||
expert_map=None,
|
topk_ids=topk_ids,
|
||||||
align_block_size=align_block_size,
|
n_expert=num_experts,
|
||||||
)
|
expert_map=None,
|
||||||
|
align_block_size=align_block_size,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
(
|
(
|
||||||
@ -150,18 +152,19 @@ def benchmark_unpermute(
|
|||||||
|
|
||||||
def prepare():
|
def prepare():
|
||||||
if use_customized_permute:
|
if use_customized_permute:
|
||||||
(permuted_hidden_states, first_token_off, inv_perm_idx, m_indices) = (
|
(
|
||||||
moe_permute(
|
permuted_hidden_states,
|
||||||
qhidden_states,
|
a1q_scale,
|
||||||
topk_weights=topk_weights,
|
first_token_off,
|
||||||
topk_ids=topk_ids,
|
inv_perm_idx,
|
||||||
token_expert_indices=token_expert_indices,
|
m_indices,
|
||||||
topk=topk,
|
) = moe_permute(
|
||||||
n_expert=num_experts,
|
qhidden_states,
|
||||||
n_local_expert=num_experts,
|
a1q_scale=None,
|
||||||
expert_map=None,
|
topk_ids=topk_ids,
|
||||||
align_block_size=align_block_size,
|
n_expert=num_experts,
|
||||||
)
|
expert_map=None,
|
||||||
|
align_block_size=align_block_size,
|
||||||
)
|
)
|
||||||
# convert to fp16/bf16 as gemm output
|
# convert to fp16/bf16 as gemm output
|
||||||
return (
|
return (
|
||||||
@ -191,16 +194,19 @@ def benchmark_unpermute(
|
|||||||
|
|
||||||
def run(input: tuple):
|
def run(input: tuple):
|
||||||
if use_customized_permute:
|
if use_customized_permute:
|
||||||
(permuted_hidden_states, first_token_off, inv_perm_idx, m_indices) = input
|
(
|
||||||
|
permuted_hidden_states,
|
||||||
|
first_token_off,
|
||||||
|
inv_perm_idx,
|
||||||
|
m_indices,
|
||||||
|
) = input
|
||||||
|
output = torch.empty_like(hidden_states)
|
||||||
moe_unpermute(
|
moe_unpermute(
|
||||||
|
output,
|
||||||
permuted_hidden_states,
|
permuted_hidden_states,
|
||||||
topk_weights,
|
topk_weights,
|
||||||
topk_ids,
|
|
||||||
inv_perm_idx,
|
inv_perm_idx,
|
||||||
first_token_off,
|
first_token_off,
|
||||||
topk,
|
|
||||||
num_experts,
|
|
||||||
num_experts,
|
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
(
|
(
|
||||||
@ -211,7 +217,11 @@ def benchmark_unpermute(
|
|||||||
inv_perm,
|
inv_perm,
|
||||||
) = input
|
) = input
|
||||||
_moe_unpermute_and_reduce(
|
_moe_unpermute_and_reduce(
|
||||||
output_hidden_states, permuted_hidden_states, inv_perm, topk_weights
|
output_hidden_states,
|
||||||
|
permuted_hidden_states,
|
||||||
|
inv_perm,
|
||||||
|
topk_weights,
|
||||||
|
True,
|
||||||
)
|
)
|
||||||
|
|
||||||
# JIT compilation & warmup
|
# JIT compilation & warmup
|
||||||
|
|||||||
@ -151,7 +151,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
|||||||
ops.impl("rotary_embedding", torch::kCPU, &rotary_embedding);
|
ops.impl("rotary_embedding", torch::kCPU, &rotary_embedding);
|
||||||
|
|
||||||
// Quantization
|
// Quantization
|
||||||
#if defined(__AVX512F__) || defined(__aarch64__)
|
#if defined(__AVX512F__) || (defined(__aarch64__) && !defined(__APPLE__))
|
||||||
at::Tag stride_tag = at::Tag::needs_fixed_stride_order;
|
at::Tag stride_tag = at::Tag::needs_fixed_stride_order;
|
||||||
|
|
||||||
// Compute int8 quantized tensor for given scaling factor.
|
// Compute int8 quantized tensor for given scaling factor.
|
||||||
|
|||||||
@ -10,32 +10,28 @@
|
|||||||
|
|
||||||
void moe_permute(
|
void moe_permute(
|
||||||
const torch::Tensor& input, // [n_token, hidden]
|
const torch::Tensor& input, // [n_token, hidden]
|
||||||
const torch::Tensor& topk_weights, //[n_token, topk]
|
const torch::Tensor& topk_ids, // [n_token, topk]
|
||||||
torch::Tensor& topk_ids, // [n_token, topk]
|
|
||||||
const torch::Tensor& token_expert_indices, // [n_token, topk]
|
const torch::Tensor& token_expert_indices, // [n_token, topk]
|
||||||
const std::optional<torch::Tensor>& expert_map, // [n_expert]
|
const std::optional<torch::Tensor>& expert_map, // [n_expert]
|
||||||
int64_t n_expert, int64_t n_local_expert, int64_t topk,
|
int64_t n_expert, int64_t n_local_expert, int64_t topk,
|
||||||
const std::optional<int64_t>& align_block_size,
|
const std::optional<int64_t>& align_block_size,
|
||||||
torch::Tensor&
|
torch::Tensor& permuted_input, // [permuted_size, hidden]
|
||||||
permuted_input, // [topk * n_token/align_block_size_m, hidden]
|
|
||||||
torch::Tensor& expert_first_token_offset, // [n_local_expert + 1]
|
torch::Tensor& expert_first_token_offset, // [n_local_expert + 1]
|
||||||
torch::Tensor& src_row_id2dst_row_id_map, // [n_token, topk]
|
torch::Tensor& inv_permuted_idx, // [n_token, topk]
|
||||||
|
torch::Tensor& permuted_idx, // [permute_size]
|
||||||
torch::Tensor& m_indices) { // [align_expand_m]
|
torch::Tensor& m_indices) { // [align_expand_m]
|
||||||
TORCH_CHECK(topk_weights.scalar_type() == at::ScalarType::Float,
|
|
||||||
"topk_weights must be float32");
|
|
||||||
TORCH_CHECK(expert_first_token_offset.scalar_type() == at::ScalarType::Long,
|
TORCH_CHECK(expert_first_token_offset.scalar_type() == at::ScalarType::Long,
|
||||||
"expert_first_token_offset must be int64");
|
"expert_first_token_offset must be int64");
|
||||||
TORCH_CHECK(topk_ids.scalar_type() == at::ScalarType::Int,
|
TORCH_CHECK(topk_ids.scalar_type() == at::ScalarType::Int,
|
||||||
"topk_ids must be int32");
|
"topk_ids must be int32");
|
||||||
TORCH_CHECK(token_expert_indices.scalar_type() == at::ScalarType::Int,
|
TORCH_CHECK(token_expert_indices.scalar_type() == at::ScalarType::Int,
|
||||||
"token_expert_indices must be int32");
|
"token_expert_indices must be int32");
|
||||||
TORCH_CHECK(src_row_id2dst_row_id_map.scalar_type() == at::ScalarType::Int,
|
TORCH_CHECK(inv_permuted_idx.scalar_type() == at::ScalarType::Int,
|
||||||
"src_row_id2dst_row_id_map must be int32");
|
"inv_permuted_idx must be int32");
|
||||||
TORCH_CHECK(expert_first_token_offset.size(0) == n_local_expert + 1,
|
TORCH_CHECK(expert_first_token_offset.size(0) == n_local_expert + 1,
|
||||||
"expert_first_token_offset shape != n_local_expert+1")
|
"expert_first_token_offset shape != n_local_expert+1")
|
||||||
TORCH_CHECK(
|
TORCH_CHECK(inv_permuted_idx.sizes() == token_expert_indices.sizes(),
|
||||||
src_row_id2dst_row_id_map.sizes() == token_expert_indices.sizes(),
|
"token_expert_indices shape must be same as inv_permuted_idx");
|
||||||
"token_expert_indices shape must be same as src_row_id2dst_row_id_map");
|
|
||||||
auto n_token = input.sizes()[0];
|
auto n_token = input.sizes()[0];
|
||||||
auto n_hidden = input.sizes()[1];
|
auto n_hidden = input.sizes()[1];
|
||||||
auto align_block_size_value =
|
auto align_block_size_value =
|
||||||
@ -46,8 +42,9 @@ void moe_permute(
|
|||||||
auto sort_workspace = torch::empty(
|
auto sort_workspace = torch::empty(
|
||||||
{sorter_size},
|
{sorter_size},
|
||||||
torch::dtype(torch::kInt8).device(torch::kCUDA).requires_grad(false));
|
torch::dtype(torch::kInt8).device(torch::kCUDA).requires_grad(false));
|
||||||
|
auto copy_topk_ids = topk_ids.clone(); // copy topk_ids for preprocess
|
||||||
auto permuted_experts_id = torch::empty_like(topk_ids);
|
auto permuted_experts_id = torch::empty_like(topk_ids);
|
||||||
auto dst_row_id2src_row_id_map = torch::empty_like(src_row_id2dst_row_id_map);
|
auto sorted_row_idx = torch::empty_like(inv_permuted_idx);
|
||||||
auto align_expert_first_token_offset =
|
auto align_expert_first_token_offset =
|
||||||
torch::zeros_like(expert_first_token_offset);
|
torch::zeros_like(expert_first_token_offset);
|
||||||
|
|
||||||
@ -67,24 +64,22 @@ void moe_permute(
|
|||||||
const int* expert_map_ptr = get_ptr<int>(expert_map.value());
|
const int* expert_map_ptr = get_ptr<int>(expert_map.value());
|
||||||
valid_num_ptr =
|
valid_num_ptr =
|
||||||
get_ptr<int64_t>(expert_first_token_offset) + n_local_expert;
|
get_ptr<int64_t>(expert_first_token_offset) + n_local_expert;
|
||||||
preprocessTopkIdLauncher(get_ptr<int>(topk_ids), n_token * topk,
|
preprocessTopkIdLauncher(get_ptr<int>(copy_topk_ids), n_token * topk,
|
||||||
expert_map_ptr, n_expert, stream);
|
expert_map_ptr, n_expert, stream);
|
||||||
}
|
}
|
||||||
// Sort the top-k expert ids, then scan them to compute expert_first_token_offset
|
// Sort the top-k expert ids, then scan them to compute expert_first_token_offset
|
||||||
sortAndScanExpert(get_ptr<int>(topk_ids), get_ptr<int>(token_expert_indices),
|
sortAndScanExpert(
|
||||||
get_ptr<int>(permuted_experts_id),
|
get_ptr<int>(copy_topk_ids), get_ptr<int>(token_expert_indices),
|
||||||
get_ptr<int>(dst_row_id2src_row_id_map),
|
get_ptr<int>(permuted_experts_id), get_ptr<int>(sorted_row_idx),
|
||||||
get_ptr<int64_t>(expert_first_token_offset), n_token,
|
get_ptr<int64_t>(expert_first_token_offset), n_token, n_expert,
|
||||||
n_expert, n_local_expert, topk, sorter,
|
n_local_expert, topk, sorter, get_ptr<int>(sort_workspace), stream);
|
||||||
get_ptr<int>(sort_workspace), stream);
|
|
||||||
|
|
||||||
// dispatch expandInputRowsKernelLauncher
|
// dispatch expandInputRowsKernelLauncher
|
||||||
MOE_DISPATCH(input.scalar_type(), [&] {
|
MOE_DISPATCH(input.scalar_type(), [&] {
|
||||||
expandInputRowsKernelLauncher<scalar_t>(
|
expandInputRowsKernelLauncher<scalar_t>(
|
||||||
get_ptr<scalar_t>(input), get_ptr<scalar_t>(permuted_input),
|
get_ptr<scalar_t>(input), get_ptr<scalar_t>(permuted_input),
|
||||||
get_ptr<float>(topk_weights), get_ptr<int>(permuted_experts_id),
|
get_ptr<int>(permuted_experts_id), get_ptr<int>(sorted_row_idx),
|
||||||
get_ptr<int>(dst_row_id2src_row_id_map),
|
get_ptr<int>(inv_permuted_idx), get_ptr<int>(permuted_idx),
|
||||||
get_ptr<int>(src_row_id2dst_row_id_map),
|
|
||||||
get_ptr<int64_t>(expert_first_token_offset), n_token, valid_num_ptr,
|
get_ptr<int64_t>(expert_first_token_offset), n_token, valid_num_ptr,
|
||||||
n_hidden, topk, n_local_expert, align_block_size_value, stream);
|
n_hidden, topk, n_local_expert, align_block_size_value, stream);
|
||||||
});
|
});
|
||||||
@ -101,32 +96,34 @@ void moe_permute(
|
|||||||
}
|
}
|
||||||
|
|
||||||
void moe_unpermute(
|
void moe_unpermute(
|
||||||
const torch::Tensor& permuted_hidden_states, // [n_token * topk, hidden]
|
const torch::Tensor& permuted_hidden_states, // [n_token * topk, hidden]
|
||||||
const torch::Tensor& topk_weights, //[n_token, topk]
|
const torch::Tensor& topk_weights, // [n_token, topk]
|
||||||
const torch::Tensor& topk_ids, // [n_token, topk]
|
const torch::Tensor& inv_permuted_idx, // [n_token, topk]
|
||||||
const torch::Tensor& src_row_id2dst_row_id_map, // [n_token, topk]
|
const std::optional<torch::Tensor>&
|
||||||
const torch::Tensor& expert_first_token_offset, // [n_local_expert+1]
|
expert_first_token_offset, // [n_local_expert+1]
|
||||||
int64_t n_expert, int64_t n_local_expert, int64_t topk,
|
int64_t topk,
|
||||||
torch::Tensor& hidden_states // [n_token, hidden]
|
torch::Tensor& hidden_states // [n_token, hidden]
|
||||||
) {
|
) {
|
||||||
TORCH_CHECK(src_row_id2dst_row_id_map.sizes() == topk_ids.sizes(),
|
|
||||||
"topk_ids shape must be same as src_row_id2dst_row_id_map");
|
|
||||||
TORCH_CHECK(topk_ids.scalar_type() == at::ScalarType::Int,
|
|
||||||
"topk_ids must be int32");
|
|
||||||
TORCH_CHECK(
|
TORCH_CHECK(
|
||||||
permuted_hidden_states.scalar_type() == hidden_states.scalar_type(),
|
permuted_hidden_states.scalar_type() == hidden_states.scalar_type(),
|
||||||
"topk_ids dtype must be same as src_row_id2dst_row_id_map");
|
"permuted_hidden_states dtype must be same as hidden_states");
|
||||||
auto n_token = hidden_states.size(0);
|
auto n_token = hidden_states.size(0);
|
||||||
auto n_hidden = hidden_states.size(1);
|
auto n_hidden = hidden_states.size(1);
|
||||||
auto stream = at::cuda::getCurrentCUDAStream().stream();
|
auto stream = at::cuda::getCurrentCUDAStream().stream();
|
||||||
const int64_t* valid_ptr =
|
|
||||||
get_ptr<int64_t>(expert_first_token_offset) + n_local_expert;
|
int64_t const* valid_ptr = nullptr;
|
||||||
|
if (expert_first_token_offset.has_value()) {
|
||||||
|
int n_local_expert = expert_first_token_offset.value().size(0) - 1;
|
||||||
|
valid_ptr =
|
||||||
|
get_ptr<int64_t>(expert_first_token_offset.value()) + n_local_expert;
|
||||||
|
}
|
||||||
|
|
||||||
MOE_DISPATCH(hidden_states.scalar_type(), [&] {
|
MOE_DISPATCH(hidden_states.scalar_type(), [&] {
|
||||||
finalizeMoeRoutingKernelLauncher<scalar_t, scalar_t>(
|
finalizeMoeRoutingKernelLauncher<scalar_t, scalar_t>(
|
||||||
get_ptr<scalar_t>(permuted_hidden_states),
|
get_ptr<scalar_t>(permuted_hidden_states),
|
||||||
get_ptr<scalar_t>(hidden_states), get_ptr<float>(topk_weights),
|
get_ptr<scalar_t>(hidden_states), get_ptr<float>(topk_weights),
|
||||||
get_ptr<int>(src_row_id2dst_row_id_map), get_ptr<int>(topk_ids),
|
get_ptr<int>(inv_permuted_idx), n_token, n_hidden, topk, valid_ptr,
|
||||||
n_token, n_hidden, topk, valid_ptr, stream);
|
stream);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -177,7 +177,7 @@ __global__ void getMIndicesKernel(int64_t* expert_first_token_offset,
|
|||||||
int tidx = threadIdx.x;
|
int tidx = threadIdx.x;
|
||||||
extern __shared__ int64_t smem_expert_first_token_offset[];
|
extern __shared__ int64_t smem_expert_first_token_offset[];
|
||||||
for (int i = tidx; i <= num_local_expert; i += blockDim.x) {
|
for (int i = tidx; i <= num_local_expert; i += blockDim.x) {
|
||||||
smem_expert_first_token_offset[tidx] = __ldg(expert_first_token_offset + i);
|
smem_expert_first_token_offset[i] = __ldg(expert_first_token_offset + i);
|
||||||
}
|
}
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
auto last_token_offset = smem_expert_first_token_offset[eidx + 1];
|
auto last_token_offset = smem_expert_first_token_offset[eidx + 1];
|
||||||
|
|||||||
@ -57,31 +57,19 @@ void sortAndScanExpert(int* expert_for_source_row, const int* source_rows,
|
|||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void expandInputRowsKernelLauncher(
|
void expandInputRowsKernelLauncher(
|
||||||
T const* unpermuted_input, T* permuted_output,
|
T const* unpermuted_input, T* permuted_output, int* sorted_experts,
|
||||||
const float* unpermuted_scales, int* sorted_experts,
|
|
||||||
int const* expanded_dest_row_to_expanded_source_row,
|
int const* expanded_dest_row_to_expanded_source_row,
|
||||||
int* expanded_source_row_to_expanded_dest_row,
|
int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
|
||||||
int64_t* expert_first_token_offset, int64_t const num_rows,
|
int64_t* expert_first_token_offset, int64_t const num_rows,
|
||||||
int64_t const* num_valid_tokens_ptr, int64_t const cols, int const k,
|
int64_t const* num_valid_tokens_ptr, int64_t const cols, int const k,
|
||||||
int num_local_experts, const int& align_block_size, cudaStream_t stream);
|
int num_local_experts, const int& align_block_size, cudaStream_t stream);
|
||||||
|
|
||||||
// Final kernel to unpermute and scale
|
|
||||||
// This kernel unpermutes the original data, does the k-way reduction and
|
|
||||||
// performs the final skip connection.
|
|
||||||
template <typename T, typename OutputType, bool CHECK_SKIPPED>
|
|
||||||
__global__ void finalizeMoeRoutingKernel(
|
|
||||||
T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output,
|
|
||||||
float const* scales, int const* expanded_source_row_to_expanded_dest_row,
|
|
||||||
int const* expert_for_source_row, int64_t const orig_cols, int64_t const k,
|
|
||||||
int64_t const* num_valid_ptr);
|
|
||||||
|
|
||||||
template <class T, class OutputType>
|
template <class T, class OutputType>
|
||||||
void finalizeMoeRoutingKernelLauncher(
|
void finalizeMoeRoutingKernelLauncher(
|
||||||
T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output,
|
T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output,
|
||||||
float const* scales, int const* expanded_source_row_to_expanded_dest_row,
|
float const* scales, int const* expanded_source_row_to_expanded_dest_row,
|
||||||
int const* expert_for_source_row, int64_t const num_rows,
|
int64_t const num_rows, int64_t const cols, int64_t const k,
|
||||||
int64_t const cols, int64_t const k, int64_t const* num_valid_ptr,
|
int64_t const* num_valid_ptr, cudaStream_t stream);
|
||||||
cudaStream_t stream);
|
|
||||||
|
|
||||||
void preprocessTopkIdLauncher(int* topk_id_ptr, int size,
|
void preprocessTopkIdLauncher(int* topk_id_ptr, int size,
|
||||||
const int* expert_map_ptr, int num_experts,
|
const int* expert_map_ptr, int num_experts,
|
||||||
|
|||||||
@ -2,10 +2,9 @@
|
|||||||
|
|
||||||
template <typename T, bool CHECK_SKIPPED, bool ALIGN_BLOCK_SIZE>
|
template <typename T, bool CHECK_SKIPPED, bool ALIGN_BLOCK_SIZE>
|
||||||
__global__ void expandInputRowsKernel(
|
__global__ void expandInputRowsKernel(
|
||||||
T const* unpermuted_input, T* permuted_output,
|
T const* unpermuted_input, T* permuted_output, int* sorted_experts,
|
||||||
const float* unpermuted_scales, int* sorted_experts,
|
|
||||||
int const* expanded_dest_row_to_expanded_source_row,
|
int const* expanded_dest_row_to_expanded_source_row,
|
||||||
int* expanded_source_row_to_expanded_dest_row,
|
int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
|
||||||
int64_t* expert_first_token_offset, int64_t const num_rows,
|
int64_t* expert_first_token_offset, int64_t const num_rows,
|
||||||
int64_t const* num_dest_rows, int64_t const cols, int64_t k,
|
int64_t const* num_dest_rows, int64_t const cols, int64_t k,
|
||||||
int num_local_experts, int align_block_size) {
|
int num_local_experts, int align_block_size) {
|
||||||
@ -54,6 +53,10 @@ __global__ void expandInputRowsKernel(
|
|||||||
assert(expanded_dest_row <= INT32_MAX);
|
assert(expanded_dest_row <= INT32_MAX);
|
||||||
expanded_source_row_to_expanded_dest_row[expanded_source_row] =
|
expanded_source_row_to_expanded_dest_row[expanded_source_row] =
|
||||||
static_cast<int>(expanded_dest_row);
|
static_cast<int>(expanded_dest_row);
|
||||||
|
// skip non local expert token
|
||||||
|
if (!CHECK_SKIPPED || blockIdx.x < *num_dest_rows) {
|
||||||
|
permuted_idx[expanded_dest_row] = expanded_source_row;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!CHECK_SKIPPED || blockIdx.x < *num_dest_rows) {
|
if (!CHECK_SKIPPED || blockIdx.x < *num_dest_rows) {
|
||||||
@ -62,7 +65,7 @@ __global__ void expandInputRowsKernel(
|
|||||||
using DataElem = cutlass::Array<T, ELEM_PER_THREAD>;
|
using DataElem = cutlass::Array<T, ELEM_PER_THREAD>;
|
||||||
|
|
||||||
// Duplicate and permute rows
|
// Duplicate and permute rows
|
||||||
int64_t const source_row = expanded_source_row % num_rows;
|
int64_t const source_row = expanded_source_row / k;
|
||||||
|
|
||||||
auto const* source_row_ptr =
|
auto const* source_row_ptr =
|
||||||
reinterpret_cast<DataElem const*>(unpermuted_input + source_row * cols);
|
reinterpret_cast<DataElem const*>(unpermuted_input + source_row * cols);
|
||||||
@ -82,10 +85,9 @@ __global__ void expandInputRowsKernel(
|
|||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void expandInputRowsKernelLauncher(
|
void expandInputRowsKernelLauncher(
|
||||||
T const* unpermuted_input, T* permuted_output,
|
T const* unpermuted_input, T* permuted_output, int* sorted_experts,
|
||||||
const float* unpermuted_scales, int* sorted_experts,
|
|
||||||
int const* expanded_dest_row_to_expanded_source_row,
|
int const* expanded_dest_row_to_expanded_source_row,
|
||||||
int* expanded_source_row_to_expanded_dest_row,
|
int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
|
||||||
int64_t* expert_first_token_offset, int64_t const num_rows,
|
int64_t* expert_first_token_offset, int64_t const num_rows,
|
||||||
int64_t const* num_valid_tokens_ptr, int64_t const cols, int const k,
|
int64_t const* num_valid_tokens_ptr, int64_t const cols, int const k,
|
||||||
int num_local_experts, const int& align_block_size, cudaStream_t stream) {
|
int num_local_experts, const int& align_block_size, cudaStream_t stream) {
|
||||||
@ -105,11 +107,11 @@ void expandInputRowsKernelLauncher(
|
|||||||
int64_t smem_size = sizeof(int64_t) * (num_local_experts + 1);
|
int64_t smem_size = sizeof(int64_t) * (num_local_experts + 1);
|
||||||
|
|
||||||
func<<<blocks, threads, smem_size, stream>>>(
|
func<<<blocks, threads, smem_size, stream>>>(
|
||||||
unpermuted_input, permuted_output, unpermuted_scales, sorted_experts,
|
unpermuted_input, permuted_output, sorted_experts,
|
||||||
expanded_dest_row_to_expanded_source_row,
|
expanded_dest_row_to_expanded_source_row,
|
||||||
expanded_source_row_to_expanded_dest_row, expert_first_token_offset,
|
expanded_source_row_to_expanded_dest_row, permuted_idx,
|
||||||
num_rows, num_valid_tokens_ptr, cols, k, num_local_experts,
|
expert_first_token_offset, num_rows, num_valid_tokens_ptr, cols, k,
|
||||||
align_block_size);
|
num_local_experts, align_block_size);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class T, class U>
|
template <class T, class U>
|
||||||
@ -128,11 +130,9 @@ template <typename T, typename OutputType, bool CHECK_SKIPPED>
|
|||||||
__global__ void finalizeMoeRoutingKernel(
|
__global__ void finalizeMoeRoutingKernel(
|
||||||
T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output,
|
T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output,
|
||||||
float const* scales, int const* expanded_source_row_to_expanded_dest_row,
|
float const* scales, int const* expanded_source_row_to_expanded_dest_row,
|
||||||
int const* expert_for_source_row, int64_t const orig_cols, int64_t const k,
|
int64_t const orig_cols, int64_t const k, int64_t const* num_valid_ptr) {
|
||||||
int64_t const* num_valid_ptr) {
|
|
||||||
assert(orig_cols % 4 == 0);
|
assert(orig_cols % 4 == 0);
|
||||||
int64_t const original_row = blockIdx.x;
|
int64_t const original_row = blockIdx.x;
|
||||||
int64_t const num_rows = gridDim.x;
|
|
||||||
auto const offset = original_row * orig_cols;
|
auto const offset = original_row * orig_cols;
|
||||||
OutputType* reduced_row_ptr = reduced_unpermuted_output + offset;
|
OutputType* reduced_row_ptr = reduced_unpermuted_output + offset;
|
||||||
int64_t const num_valid = *num_valid_ptr;
|
int64_t const num_valid = *num_valid_ptr;
|
||||||
@ -159,14 +159,13 @@ __global__ void finalizeMoeRoutingKernel(
|
|||||||
ComputeElem thread_output;
|
ComputeElem thread_output;
|
||||||
thread_output.fill(0);
|
thread_output.fill(0);
|
||||||
for (int k_idx = 0; k_idx < k; ++k_idx) {
|
for (int k_idx = 0; k_idx < k; ++k_idx) {
|
||||||
int64_t const expanded_original_row = original_row + k_idx * num_rows;
|
int64_t const expanded_original_row = original_row * k + k_idx;
|
||||||
int64_t const expanded_permuted_row =
|
int64_t const expanded_permuted_row =
|
||||||
expanded_source_row_to_expanded_dest_row[expanded_original_row];
|
expanded_source_row_to_expanded_dest_row[expanded_original_row];
|
||||||
|
|
||||||
int64_t const k_offset = original_row * k + k_idx;
|
int64_t const k_offset = original_row * k + k_idx;
|
||||||
float const row_scale = scales[k_offset];
|
float const row_scale = scales[k_offset];
|
||||||
|
|
||||||
// Check after row_rescale has accumulated
|
|
||||||
if (CHECK_SKIPPED && expanded_permuted_row >= num_valid) {
|
if (CHECK_SKIPPED && expanded_permuted_row >= num_valid) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@ -189,9 +188,8 @@ template <class T, class OutputType>
|
|||||||
void finalizeMoeRoutingKernelLauncher(
|
void finalizeMoeRoutingKernelLauncher(
|
||||||
T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output,
|
T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output,
|
||||||
float const* scales, int const* expanded_source_row_to_expanded_dest_row,
|
float const* scales, int const* expanded_source_row_to_expanded_dest_row,
|
||||||
int const* expert_for_source_row, int64_t const num_rows,
|
int64_t const num_rows, int64_t const cols, int64_t const k,
|
||||||
int64_t const cols, int64_t const k, int64_t const* num_valid_ptr,
|
int64_t const* num_valid_ptr, cudaStream_t stream) {
|
||||||
cudaStream_t stream) {
|
|
||||||
int64_t const blocks = num_rows;
|
int64_t const blocks = num_rows;
|
||||||
int64_t const threads = 256;
|
int64_t const threads = 256;
|
||||||
bool const check_finished = num_valid_ptr != nullptr;
|
bool const check_finished = num_valid_ptr != nullptr;
|
||||||
@ -201,6 +199,5 @@ void finalizeMoeRoutingKernelLauncher(
|
|||||||
auto* const kernel = func_map[check_finished];
|
auto* const kernel = func_map[check_finished];
|
||||||
kernel<<<blocks, threads, 0, stream>>>(
|
kernel<<<blocks, threads, 0, stream>>>(
|
||||||
expanded_permuted_rows, reduced_unpermuted_output, scales,
|
expanded_permuted_rows, reduced_unpermuted_output, scales,
|
||||||
expanded_source_row_to_expanded_dest_row, expert_for_source_row, cols, k,
|
expanded_source_row_to_expanded_dest_row, cols, k, num_valid_ptr);
|
||||||
num_valid_ptr);
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -56,18 +56,17 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
|
|||||||
" -> Tensor");
|
" -> Tensor");
|
||||||
|
|
||||||
m.def(
|
m.def(
|
||||||
"moe_permute(Tensor input, Tensor topk_weight, Tensor! topk_ids,"
|
"moe_permute(Tensor input, Tensor topk_ids,"
|
||||||
"Tensor token_expert_indices, Tensor? expert_map, int n_expert,"
|
"Tensor token_expert_indices, Tensor? expert_map, int n_expert,"
|
||||||
"int n_local_expert,"
|
"int n_local_expert,"
|
||||||
"int topk, int? align_block_size,Tensor! permuted_input, Tensor! "
|
"int topk, int? align_block_size,Tensor! permuted_input, Tensor! "
|
||||||
"expert_first_token_offset, Tensor! src_row_id2dst_row_id_map, Tensor! "
|
"expert_first_token_offset, Tensor! inv_permuted_idx, Tensor! "
|
||||||
"m_indices)->()");
|
"permuted_idx, Tensor! m_indices)->()");
|
||||||
|
|
||||||
m.def(
|
m.def(
|
||||||
"moe_unpermute(Tensor permuted_hidden_states, Tensor topk_weights,"
|
"moe_unpermute(Tensor permuted_hidden_states, Tensor topk_weights,"
|
||||||
"Tensor topk_ids,Tensor src_row_id2dst_row_id_map, Tensor "
|
"Tensor inv_permuted_idx, Tensor? expert_first_token_offset, "
|
||||||
"expert_first_token_offset, int n_expert, int n_local_expert,int "
|
"int topk, Tensor! hidden_states)->()");
|
||||||
"topk, Tensor! hidden_states)->()");
|
|
||||||
|
|
||||||
m.def("moe_permute_unpermute_supported() -> bool");
|
m.def("moe_permute_unpermute_supported() -> bool");
|
||||||
m.impl("moe_permute_unpermute_supported", &moe_permute_unpermute_supported);
|
m.impl("moe_permute_unpermute_supported", &moe_permute_unpermute_supported);
|
||||||
|
|||||||
@ -292,6 +292,11 @@ void per_token_group_quant_fp8(const torch::Tensor& input,
|
|||||||
torch::Tensor& output_q, torch::Tensor& output_s,
|
torch::Tensor& output_q, torch::Tensor& output_s,
|
||||||
int64_t group_size, double eps, double fp8_min,
|
int64_t group_size, double eps, double fp8_min,
|
||||||
double fp8_max, bool scale_ue8m0);
|
double fp8_max, bool scale_ue8m0);
|
||||||
|
|
||||||
|
void per_token_group_quant_int8(const torch::Tensor& input,
|
||||||
|
torch::Tensor& output_q,
|
||||||
|
torch::Tensor& output_s, int64_t group_size,
|
||||||
|
double eps, double int8_min, double int8_max);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
|
void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
|
||||||
|
|||||||
@ -1,6 +1,8 @@
|
|||||||
#include <ATen/cuda/CUDAContext.h>
|
#include <ATen/cuda/CUDAContext.h>
|
||||||
#include <torch/all.h>
|
#include <torch/all.h>
|
||||||
|
|
||||||
|
#include "../per_token_group_quant_8bit.h"
|
||||||
|
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
|
|
||||||
#include "../../dispatch_utils.h"
|
#include "../../dispatch_utils.h"
|
||||||
@ -336,3 +338,11 @@ void dynamic_scaled_int8_quant(
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void per_token_group_quant_int8(const torch::Tensor& input,
|
||||||
|
torch::Tensor& output_q,
|
||||||
|
torch::Tensor& output_s, int64_t group_size,
|
||||||
|
double eps, double int8_min, double int8_max) {
|
||||||
|
per_token_group_quant_8bit(input, output_q, output_s, group_size, eps,
|
||||||
|
int8_min, int8_max);
|
||||||
|
}
|
||||||
@ -1,6 +1,8 @@
|
|||||||
#include <ATen/cuda/CUDAContext.h>
|
#include <ATen/cuda/CUDAContext.h>
|
||||||
#include <c10/util/Float8_e4m3fn.h>
|
#include <c10/util/Float8_e4m3fn.h>
|
||||||
|
|
||||||
|
#include "../per_token_group_quant_8bit.h"
|
||||||
|
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
|
|
||||||
#include <cuda_fp16.h>
|
#include <cuda_fp16.h>
|
||||||
@ -120,7 +122,7 @@ void per_token_group_quant_8bit(const torch::Tensor& input,
|
|||||||
torch::Tensor& output_q,
|
torch::Tensor& output_q,
|
||||||
torch::Tensor& output_s, int64_t group_size,
|
torch::Tensor& output_s, int64_t group_size,
|
||||||
double eps, double min_8bit, double max_8bit,
|
double eps, double min_8bit, double max_8bit,
|
||||||
bool scale_ue8m0 = false) {
|
bool scale_ue8m0) {
|
||||||
TORCH_CHECK(input.is_contiguous());
|
TORCH_CHECK(input.is_contiguous());
|
||||||
TORCH_CHECK(output_q.is_contiguous());
|
TORCH_CHECK(output_q.is_contiguous());
|
||||||
|
|
||||||
@ -198,6 +200,8 @@ void per_token_group_quant_8bit(const torch::Tensor& input,
|
|||||||
input.scalar_type(), "per_token_group_quant_8bit", ([&] {
|
input.scalar_type(), "per_token_group_quant_8bit", ([&] {
|
||||||
if (dst_type == at::ScalarType::Float8_e4m3fn) {
|
if (dst_type == at::ScalarType::Float8_e4m3fn) {
|
||||||
LAUNCH_KERNEL(scalar_t, c10::Float8_e4m3fn);
|
LAUNCH_KERNEL(scalar_t, c10::Float8_e4m3fn);
|
||||||
|
} else if (dst_type == at::ScalarType::Char) {
|
||||||
|
LAUNCH_KERNEL(scalar_t, int8_t);
|
||||||
}
|
}
|
||||||
}));
|
}));
|
||||||
|
|
||||||
|
|||||||
@ -187,8 +187,12 @@ struct PrepackedLayoutBTemplate {
|
|||||||
CUTE_HOST_DEVICE static constexpr auto TVbNbKL_to_offset_copy(
|
CUTE_HOST_DEVICE static constexpr auto TVbNbKL_to_offset_copy(
|
||||||
Shape_NKL shape_mkl) {
|
Shape_NKL shape_mkl) {
|
||||||
auto layout = TVbNbKL_to_offset(shape_mkl);
|
auto layout = TVbNbKL_to_offset(shape_mkl);
|
||||||
return make_layout(coalesce(get<0>(layout)), get<1>(layout),
|
// for 4-bit elements, having >= 64 values per column
|
||||||
get<2>(layout));
|
// allows TMA to load full 32-byte sectors
|
||||||
|
auto inner_layout =
|
||||||
|
make_layout(make_shape(_256{}, size<0>(layout) / _256{}));
|
||||||
|
|
||||||
|
return make_layout(inner_layout, get<1>(layout), get<2>(layout));
|
||||||
}
|
}
|
||||||
|
|
||||||
// ((BlockN, BlockK), (BlocksN, BlocksK), L) -> (storage_idx)
|
// ((BlockN, BlockK), (BlocksN, BlocksK), L) -> (storage_idx)
|
||||||
|
|||||||
10
csrc/quantization/per_token_group_quant_8bit.h
Normal file
10
csrc/quantization/per_token_group_quant_8bit.h
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
#pragma once
|
||||||
|
#include <torch/all.h>
|
||||||
|
|
||||||
|
// TODO(wentao): refactor the folder to 8bit, then includes fp8 and int8 folders
|
||||||
|
// 8-bit per-token-group quantization helper used by both FP8 and INT8
|
||||||
|
void per_token_group_quant_8bit(const torch::Tensor& input,
|
||||||
|
torch::Tensor& output_q,
|
||||||
|
torch::Tensor& output_s, int64_t group_size,
|
||||||
|
double eps, double min_8bit, double max_8bit,
|
||||||
|
bool scale_ue8m0 = false);
|
||||||
@ -624,6 +624,14 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
|||||||
ops.impl("per_token_group_fp8_quant", torch::kCUDA,
|
ops.impl("per_token_group_fp8_quant", torch::kCUDA,
|
||||||
&per_token_group_quant_fp8);
|
&per_token_group_quant_fp8);
|
||||||
|
|
||||||
|
// Compute per-token-group INT8 quantized tensor and scaling factor.
|
||||||
|
ops.def(
|
||||||
|
"per_token_group_quant_int8(Tensor input, Tensor! output_q, Tensor! "
|
||||||
|
"output_s, int group_size, float eps, float int8_min, float int8_max) -> "
|
||||||
|
"()");
|
||||||
|
ops.impl("per_token_group_quant_int8", torch::kCUDA,
|
||||||
|
&per_token_group_quant_int8);
|
||||||
|
|
||||||
// reorder weight for AllSpark Ampere W8A16 Fused Gemm kernel
|
// reorder weight for AllSpark Ampere W8A16 Fused Gemm kernel
|
||||||
ops.def(
|
ops.def(
|
||||||
"rearrange_kn_weight_as_n32k16_order(Tensor b_qweight, Tensor b_scales, "
|
"rearrange_kn_weight_as_n32k16_order(Tensor b_qweight, Tensor b_scales, "
|
||||||
|
|||||||
@ -1,62 +0,0 @@
|
|||||||
# This vLLM Dockerfile is used to construct an image that can build and run vLLM on ARM CPU platform.
|
|
||||||
|
|
||||||
FROM ubuntu:22.04 AS cpu-test-arm
|
|
||||||
|
|
||||||
ENV CCACHE_DIR=/root/.cache/ccache
|
|
||||||
|
|
||||||
ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
|
|
||||||
|
|
||||||
RUN --mount=type=cache,target=/var/cache/apt \
|
|
||||||
apt-get update -y \
|
|
||||||
&& apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
|
|
||||||
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
|
|
||||||
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
|
|
||||||
|
|
||||||
# tcmalloc provides better memory allocation efficiency, e.g., holding memory in caches to speed up access of commonly-used objects.
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
|
||||||
pip install py-cpuinfo # Use this to gather CPU info and optimize based on ARM Neoverse cores
|
|
||||||
|
|
||||||
# Set LD_PRELOAD for tcmalloc on ARM
|
|
||||||
ENV LD_PRELOAD="/usr/lib/aarch64-linux-gnu/libtcmalloc_minimal.so.4"
|
|
||||||
|
|
||||||
RUN echo 'ulimit -c 0' >> ~/.bashrc
|
|
||||||
|
|
||||||
WORKDIR /workspace
|
|
||||||
|
|
||||||
ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
|
|
||||||
ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
|
||||||
--mount=type=bind,src=requirements/build.txt,target=requirements/build.txt \
|
|
||||||
pip install --upgrade pip && \
|
|
||||||
pip install -r requirements/build.txt
|
|
||||||
|
|
||||||
FROM cpu-test-arm AS build
|
|
||||||
|
|
||||||
WORKDIR /workspace/vllm
|
|
||||||
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
|
||||||
--mount=type=bind,src=requirements/common.txt,target=requirements/common.txt \
|
|
||||||
--mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \
|
|
||||||
pip install -v -r requirements/cpu.txt
|
|
||||||
|
|
||||||
COPY . .
|
|
||||||
ARG GIT_REPO_CHECK=0
|
|
||||||
RUN --mount=type=bind,source=.git,target=.git \
|
|
||||||
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
|
|
||||||
|
|
||||||
# Disabling AVX512 specific optimizations for ARM
|
|
||||||
ARG VLLM_CPU_DISABLE_AVX512="true"
|
|
||||||
ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
|
|
||||||
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
|
||||||
--mount=type=cache,target=/root/.cache/ccache \
|
|
||||||
--mount=type=bind,source=.git,target=.git \
|
|
||||||
VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
|
|
||||||
pip install dist/*.whl && \
|
|
||||||
rm -rf dist
|
|
||||||
|
|
||||||
WORKDIR /workspace/
|
|
||||||
|
|
||||||
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
|
|
||||||
|
|
||||||
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
|
|
||||||
@ -1,4 +1,11 @@
|
|||||||
# This vLLM Dockerfile is used to construct image that can build and run vLLM on x86 CPU platform.
|
# This vLLM Dockerfile is used to build images that can run vLLM on both x86_64 and arm64 CPU platforms.
|
||||||
|
#
|
||||||
|
# Supported platforms:
|
||||||
|
# - linux/amd64 (x86_64)
|
||||||
|
# - linux/arm64 (aarch64)
|
||||||
|
#
|
||||||
|
# Use the `--platform` option with `docker buildx build` to specify the target architecture, e.g.:
|
||||||
|
# docker buildx build --platform=linux/arm64 -f docker/Dockerfile.cpu .
|
||||||
#
|
#
|
||||||
# Build targets:
|
# Build targets:
|
||||||
# vllm-openai (default): used for serving deployment
|
# vllm-openai (default): used for serving deployment
|
||||||
@ -53,7 +60,20 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
|||||||
uv pip install --upgrade pip && \
|
uv pip install --upgrade pip && \
|
||||||
uv pip install -r requirements/cpu.txt
|
uv pip install -r requirements/cpu.txt
|
||||||
|
|
||||||
ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/opt/venv/lib/libiomp5.so:$LD_PRELOAD"
|
ARG TARGETARCH
|
||||||
|
ENV TARGETARCH=${TARGETARCH}
|
||||||
|
|
||||||
|
RUN if [ "$TARGETARCH" = "arm64" ]; then \
|
||||||
|
PRELOAD_PATH="/usr/lib/aarch64-linux-gnu/libtcmalloc_minimal.so.4"; \
|
||||||
|
else \
|
||||||
|
PRELOAD_PATH="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/opt/venv/lib/libiomp5.so"; \
|
||||||
|
fi && \
|
||||||
|
echo "export LD_PRELOAD=$PRELOAD_PATH" >> ~/.bashrc
|
||||||
|
|
||||||
|
# Ensure that the LD_PRELOAD environment variable for export is in effect.
|
||||||
|
SHELL ["/bin/bash", "-c"]
|
||||||
|
|
||||||
|
ENV LD_PRELOAD=${LD_PRELOAD}
|
||||||
|
|
||||||
RUN echo 'ulimit -c 0' >> ~/.bashrc
|
RUN echo 'ulimit -c 0' >> ~/.bashrc
|
||||||
|
|
||||||
|
|||||||
@ -1,4 +1,4 @@
|
|||||||
ARG NIGHTLY_DATE="20250714"
|
ARG NIGHTLY_DATE="20250724"
|
||||||
ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.12_tpuvm_$NIGHTLY_DATE"
|
ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.12_tpuvm_$NIGHTLY_DATE"
|
||||||
|
|
||||||
FROM $BASE_IMAGE
|
FROM $BASE_IMAGE
|
||||||
|
|||||||
@ -9,10 +9,13 @@ We support tracing vLLM workers using the `torch.profiler` module. You can enabl
|
|||||||
|
|
||||||
The OpenAI server also needs to be started with the `VLLM_TORCH_PROFILER_DIR` environment variable set.
|
The OpenAI server also needs to be started with the `VLLM_TORCH_PROFILER_DIR` environment variable set.
|
||||||
|
|
||||||
When using `benchmarks/benchmark_serving.py`, you can enable profiling by passing the `--profile` flag.
|
When using `vllm bench serve`, you can enable profiling by passing the `--profile` flag.
|
||||||
|
|
||||||
Traces can be visualized using <https://ui.perfetto.dev/>.
|
Traces can be visualized using <https://ui.perfetto.dev/>.
|
||||||
|
|
||||||
|
!!! tip
|
||||||
|
You can call the bench module directly, without installing vLLM, using `python -m vllm.entrypoints.cli.main bench`.
|
||||||
|
|
||||||
!!! tip
|
!!! tip
|
||||||
Only send a few requests through vLLM when profiling, as the traces can get quite large. Also, no need to untar the traces, they can be viewed directly.
|
Only send a few requests through vLLM when profiling, as the traces can get quite large. Also, no need to untar the traces, they can be viewed directly.
|
||||||
|
|
||||||
@ -35,10 +38,10 @@ VLLM_TORCH_PROFILER_DIR=./vllm_profile \
|
|||||||
--model meta-llama/Meta-Llama-3-70B
|
--model meta-llama/Meta-Llama-3-70B
|
||||||
```
|
```
|
||||||
|
|
||||||
benchmark_serving.py:
|
vllm bench command:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python benchmarks/benchmark_serving.py \
|
vllm bench serve \
|
||||||
--backend vllm \
|
--backend vllm \
|
||||||
--model meta-llama/Meta-Llama-3-70B \
|
--model meta-llama/Meta-Llama-3-70B \
|
||||||
--dataset-name sharegpt \
|
--dataset-name sharegpt \
|
||||||
@ -69,13 +72,13 @@ apt install nsight-systems-cli
|
|||||||
|
|
||||||
For basic usage, you can just prepend `nsys profile -o report.nsys-rep --trace-fork-before-exec=true --cuda-graph-trace=node` to any existing script you would run for offline inference.
|
For basic usage, you can just prepend `nsys profile -o report.nsys-rep --trace-fork-before-exec=true --cuda-graph-trace=node` to any existing script you would run for offline inference.
|
||||||
|
|
||||||
The following is an example using the `benchmarks/benchmark_latency.py` script:
|
The following is an example using the `vllm bench latency` command:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
nsys profile -o report.nsys-rep \
|
nsys profile -o report.nsys-rep \
|
||||||
--trace-fork-before-exec=true \
|
--trace-fork-before-exec=true \
|
||||||
--cuda-graph-trace=node \
|
--cuda-graph-trace=node \
|
||||||
python benchmarks/benchmark_latency.py \
|
vllm bench latency \
|
||||||
--model meta-llama/Llama-3.1-8B-Instruct \
|
--model meta-llama/Llama-3.1-8B-Instruct \
|
||||||
--num-iters-warmup 5 \
|
--num-iters-warmup 5 \
|
||||||
--num-iters 1 \
|
--num-iters 1 \
|
||||||
@ -98,7 +101,7 @@ nsys profile -o report.nsys-rep \
|
|||||||
vllm serve meta-llama/Llama-3.1-8B-Instruct
|
vllm serve meta-llama/Llama-3.1-8B-Instruct
|
||||||
|
|
||||||
# client
|
# client
|
||||||
python benchmarks/benchmark_serving.py \
|
vllm bench serve \
|
||||||
--backend vllm \
|
--backend vllm \
|
||||||
--model meta-llama/Llama-3.1-8B-Instruct \
|
--model meta-llama/Llama-3.1-8B-Instruct \
|
||||||
--num-prompts 1 \
|
--num-prompts 1 \
|
||||||
|
|||||||
@ -291,7 +291,7 @@ curl -X POST -s http://10.0.1.1:10001/v1/completions \
|
|||||||
??? console "Command"
|
??? console "Command"
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
python3 benchmark_serving.py \
|
vllm bench serve \
|
||||||
--backend vllm \
|
--backend vllm \
|
||||||
--model base_model \
|
--model base_model \
|
||||||
--tokenizer meta-llama/Llama-3.1-8B-Instruct \
|
--tokenizer meta-llama/Llama-3.1-8B-Instruct \
|
||||||
|
|||||||
@ -177,6 +177,70 @@ Multi-image input can be extended to perform video captioning. We show this with
|
|||||||
You can pass a list of NumPy arrays directly to the `'video'` field of the multi-modal dictionary
|
You can pass a list of NumPy arrays directly to the `'video'` field of the multi-modal dictionary
|
||||||
instead of using multi-image input.
|
instead of using multi-image input.
|
||||||
|
|
||||||
|
Instead of NumPy arrays, you can also pass `torch.Tensor` instances, as shown in this example using Qwen2.5-VL:
|
||||||
|
|
||||||
|
??? code
|
||||||
|
|
||||||
|
```python
|
||||||
|
from transformers import AutoProcessor
|
||||||
|
from vllm import LLM, SamplingParams
|
||||||
|
from qwen_vl_utils import process_vision_info
|
||||||
|
|
||||||
|
model_path = "Qwen/Qwen2.5-VL-3B-Instruct/"
|
||||||
|
video_path = "https://content.pexels.com/videos/free-videos.mp4"
|
||||||
|
|
||||||
|
llm = LLM(
|
||||||
|
model=model_path,
|
||||||
|
gpu_memory_utilization=0.8,
|
||||||
|
enforce_eager=True,
|
||||||
|
limit_mm_per_prompt={"video": 1},
|
||||||
|
)
|
||||||
|
|
||||||
|
sampling_params = SamplingParams(
|
||||||
|
max_tokens=1024,
|
||||||
|
)
|
||||||
|
|
||||||
|
video_messages = [
|
||||||
|
{"role": "system", "content": "You are a helpful assistant."},
|
||||||
|
{"role": "user", "content": [
|
||||||
|
{"type": "text", "text": "describe this video."},
|
||||||
|
{
|
||||||
|
"type": "video",
|
||||||
|
"video": video_path,
|
||||||
|
"total_pixels": 20480 * 28 * 28,
|
||||||
|
"min_pixels": 16 * 28 * 28
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
]
|
||||||
|
|
||||||
|
messages = video_messages
|
||||||
|
processor = AutoProcessor.from_pretrained(model_path)
|
||||||
|
prompt = processor.apply_chat_template(
|
||||||
|
messages,
|
||||||
|
tokenize=False,
|
||||||
|
add_generation_prompt=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
image_inputs, video_inputs = process_vision_info(messages)
|
||||||
|
mm_data = {}
|
||||||
|
if video_inputs is not None:
|
||||||
|
mm_data["video"] = video_inputs
|
||||||
|
|
||||||
|
llm_inputs = {
|
||||||
|
"prompt": prompt,
|
||||||
|
"multi_modal_data": mm_data,
|
||||||
|
}
|
||||||
|
|
||||||
|
outputs = llm.generate([llm_inputs], sampling_params=sampling_params)
|
||||||
|
for o in outputs:
|
||||||
|
generated_text = o.outputs[0].text
|
||||||
|
print(generated_text)
|
||||||
|
```
|
||||||
|
|
||||||
|
!!! note
|
||||||
|
`process_vision_info` is only applicable to Qwen2.5-VL and similar models.
|
||||||
|
|
||||||
Full example: <gh-file:examples/offline_inference/vision_language.py>
|
Full example: <gh-file:examples/offline_inference/vision_language.py>
|
||||||
|
|
||||||
### Audio Inputs
|
### Audio Inputs
|
||||||
|
|||||||
@ -6,6 +6,7 @@ Contents:
|
|||||||
|
|
||||||
- [Supported Hardware](supported_hardware.md)
|
- [Supported Hardware](supported_hardware.md)
|
||||||
- [AutoAWQ](auto_awq.md)
|
- [AutoAWQ](auto_awq.md)
|
||||||
|
- [AutoRound](auto_round.md)
|
||||||
- [BitsAndBytes](bnb.md)
|
- [BitsAndBytes](bnb.md)
|
||||||
- [BitBLAS](bitblas.md)
|
- [BitBLAS](bitblas.md)
|
||||||
- [GGUF](gguf.md)
|
- [GGUF](gguf.md)
|
||||||
|
|||||||
103
docs/features/quantization/auto_round.md
Normal file
103
docs/features/quantization/auto_round.md
Normal file
@ -0,0 +1,103 @@
|
|||||||
|
# AutoRound
|
||||||
|
|
||||||
|
[AutoRound](https://github.com/intel/auto-round) is Intel’s advanced quantization algorithm designed to produce highly efficient **INT2, INT3, INT4, and INT8**
|
||||||
|
quantized large language models—striking an optimal balance between accuracy and deployment performance.
|
||||||
|
|
||||||
|
AutoRound applies weight-only quantization to transformer-based models, enabling significant memory savings and faster
|
||||||
|
inference while maintaining near-original accuracy. It supports a wide range of hardware platforms, including **CPUs,
|
||||||
|
Intel GPUs, HPUs, and CUDA-enabled devices**.
|
||||||
|
|
||||||
|
Please refer to the [AutoRound guide](https://github.com/intel/auto-round/blob/main/docs/step_by_step.md) for more details.
|
||||||
|
|
||||||
|
Key Features:
|
||||||
|
|
||||||
|
✅ **AutoRound, AutoAWQ, AutoGPTQ, and GGUF** export formats are supported
|
||||||
|
|
||||||
|
✅ **10+ vision-language models (VLMs)** are supported
|
||||||
|
|
||||||
|
✅ **Per-layer mixed-bit quantization** for fine-grained control
|
||||||
|
|
||||||
|
✅ **RTN (Round-To-Nearest) mode** for quick quantization with slight accuracy loss
|
||||||
|
|
||||||
|
✅ **Multiple quantization recipes**: best, base, and light
|
||||||
|
|
||||||
|
✅ Advanced utilities such as immediate packing and support for **10+ backends**
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
|
```bash
|
||||||
|
uv pip install auto-round
|
||||||
|
```
|
||||||
|
|
||||||
|
## Quantizing a model
|
||||||
|
|
||||||
|
For VLMs, use `auto-round-mllm` instead of `auto-round` on the command line and `AutoRoundMLLM` instead of `AutoRound` in the API.
|
||||||
|
|
||||||
|
### CLI usage
|
||||||
|
|
||||||
|
```bash
|
||||||
|
auto-round \
|
||||||
|
--model Qwen/Qwen3-0.6B \
|
||||||
|
--bits 4 \
|
||||||
|
--group_size 128 \
|
||||||
|
--format "auto_round" \
|
||||||
|
--output_dir ./tmp_autoround
|
||||||
|
```
|
||||||
|
|
||||||
|
```bash
|
||||||
|
auto-round \
|
||||||
|
--model Qwen/Qwen3-0.6B \
|
||||||
|
--format "gguf:q4_k_m" \
|
||||||
|
--output_dir ./tmp_autoround
|
||||||
|
```
|
||||||
|
|
||||||
|
### API usage
|
||||||
|
|
||||||
|
```python
|
||||||
|
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||||
|
from auto_round import AutoRound
|
||||||
|
|
||||||
|
model_name = "Qwen/Qwen3-0.6B"
|
||||||
|
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
|
||||||
|
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
||||||
|
|
||||||
|
bits, group_size, sym = 4, 128, True
|
||||||
|
autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym)
|
||||||
|
|
||||||
|
# the best accuracy, 4-5X slower, low_gpu_mem_usage could save ~20G but ~30% slower
|
||||||
|
# autoround = AutoRound(model, tokenizer, nsamples=512, iters=1000, low_gpu_mem_usage=True, bits=bits, group_size=group_size, sym=sym)
|
||||||
|
|
||||||
|
# 2-3X speedup, slight accuracy drop at W4G128
|
||||||
|
# autoround = AutoRound(model, tokenizer, nsamples=128, iters=50, lr=5e-3, bits=bits, group_size=group_size, sym=sym )
|
||||||
|
|
||||||
|
output_dir = "./tmp_autoround"
|
||||||
|
# format= 'auto_round'(default), 'auto_gptq', 'auto_awq'
|
||||||
|
autoround.quantize_and_save(output_dir, format="auto_round")
|
||||||
|
```
|
||||||
|
|
||||||
|
## Running a quantized model with vLLM
|
||||||
|
|
||||||
|
The following example runs a model quantized in the AutoRound format with vLLM:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from vllm import LLM, SamplingParams
|
||||||
|
|
||||||
|
prompts = [
|
||||||
|
"Hello, my name is",
|
||||||
|
]
|
||||||
|
sampling_params = SamplingParams(temperature=0.6, top_p=0.95)
|
||||||
|
model_name = "Intel/DeepSeek-R1-0528-Qwen3-8B-int4-AutoRound"
|
||||||
|
llm = LLM(model=model_name)
|
||||||
|
|
||||||
|
outputs = llm.generate(prompts, sampling_params)
|
||||||
|
|
||||||
|
for output in outputs:
|
||||||
|
prompt = output.prompt
|
||||||
|
generated_text = output.outputs[0].text
|
||||||
|
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||||
|
```
|
||||||
|
|
||||||
|
## Acknowledgement
|
||||||
|
|
||||||
|
Special thanks to open-source low precision libraries such as AutoGPTQ, AutoAWQ, GPTQModel, Triton, Marlin, and
|
||||||
|
ExLLaMAV2 for providing low-precision CUDA kernels, which are leveraged in AutoRound.
|
||||||
@ -33,7 +33,7 @@ Testing has been conducted on AWS Graviton3 instances for compatibility.
|
|||||||
# --8<-- [end:pre-built-images]
|
# --8<-- [end:pre-built-images]
|
||||||
# --8<-- [start:build-image-from-source]
|
# --8<-- [start:build-image-from-source]
|
||||||
```bash
|
```bash
|
||||||
docker build -f docker/Dockerfile.arm \
|
docker build -f docker/Dockerfile.cpu \
|
||||||
--tag vllm-cpu-env .
|
--tag vllm-cpu-env .
|
||||||
|
|
||||||
# Launching OpenAI server
|
# Launching OpenAI server
|
||||||
|
|||||||
@ -365,6 +365,7 @@ th {
|
|||||||
| `Grok1ModelForCausalLM` | Grok1 | `hpcai-tech/grok-1`. | ✅︎ | ✅︎ | ✅︎ |
|
| `Grok1ModelForCausalLM` | Grok1 | `hpcai-tech/grok-1`. | ✅︎ | ✅︎ | ✅︎ |
|
||||||
| `HunYuanDenseV1ForCausalLM` | Hunyuan-7B-Instruct-0124 | `tencent/Hunyuan-7B-Instruct-0124` | ✅︎ | | ✅︎ |
|
| `HunYuanDenseV1ForCausalLM` | Hunyuan-7B-Instruct-0124 | `tencent/Hunyuan-7B-Instruct-0124` | ✅︎ | | ✅︎ |
|
||||||
| `HunYuanMoEV1ForCausalLM` | Hunyuan-80B-A13B | `tencent/Hunyuan-A13B-Instruct`, `tencent/Hunyuan-A13B-Pretrain`, `tencent/Hunyuan-A13B-Instruct-FP8`, etc. | ✅︎ | | ✅︎ |
|
| `HunYuanMoEV1ForCausalLM` | Hunyuan-80B-A13B | `tencent/Hunyuan-A13B-Instruct`, `tencent/Hunyuan-A13B-Pretrain`, `tencent/Hunyuan-A13B-Instruct-FP8`, etc. | ✅︎ | | ✅︎ |
|
||||||
|
| `HCXVisionForCausalLM` | HyperCLOVAX-SEED-Vision-Instruct-3B | `naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B` | | | ✅︎ |
|
||||||
| `InternLMForCausalLM` | InternLM | `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
| `InternLMForCausalLM` | InternLM | `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||||
| `InternLM2ForCausalLM` | InternLM2 | `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
| `InternLM2ForCausalLM` | InternLM2 | `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||||
| `InternLM3ForCausalLM` | InternLM3 | `internlm/internlm3-8b-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
| `InternLM3ForCausalLM` | InternLM3 | `internlm/internlm3-8b-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||||
@ -592,6 +593,7 @@ Specified using `--task generate`.
|
|||||||
| `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | ✅︎ |
|
| `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | ✅︎ |
|
||||||
| `H2OVLChatModel` | H2OVL | T + I<sup>E+</sup> | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎ |
|
| `H2OVLChatModel` | H2OVL | T + I<sup>E+</sup> | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎ |
|
||||||
| `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | | ✅︎ |
|
| `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | | ✅︎ |
|
||||||
|
| `InternS1ForConditionalGeneration` | Intern-S1 | T + I<sup>E+</sup> + V<sup>E+</sup> | `internlm/Intern-S1`, etc. | | ✅︎ | ✅︎ |
|
||||||
| `InternVLChatModel` | InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I<sup>E+</sup> + (V<sup>E+</sup>) | `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
| `InternVLChatModel` | InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I<sup>E+</sup> + (V<sup>E+</sup>) | `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||||
| `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-8B-Preview` | | | ✅︎ |
|
| `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-8B-Preview` | | | ✅︎ |
|
||||||
| `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I<sup>+</sup> | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | | ✅︎ |
|
| `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I<sup>+</sup> | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | | ✅︎ |
|
||||||
@ -612,6 +614,7 @@ Specified using `--task generate`.
|
|||||||
| `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + I<sup>E</sup> | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. | | ✅︎ | ⚠️ |
|
| `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + I<sup>E</sup> | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. | | ✅︎ | ⚠️ |
|
||||||
| `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + I<sup>E+</sup> | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | | ✅︎ | ✅︎ |
|
| `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + I<sup>E+</sup> | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | | ✅︎ | ✅︎ |
|
||||||
| `Phi4MMForCausalLM` | Phi-4-multimodal | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
| `Phi4MMForCausalLM` | Phi-4-multimodal | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||||
|
| `Phi4MultimodalForCausalLM` | Phi-4-multimodal (HF Transformers) | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct` (with revision `refs/pr/70`), etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||||
| `PixtralForConditionalGeneration` | Mistral 3 (Mistral format), Pixtral (Mistral format) | T + I<sup>+</sup> | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistralai/Pixtral-12B-2409`, etc. | | ✅︎ | ✅︎ |
|
| `PixtralForConditionalGeneration` | Mistral 3 (Mistral format), Pixtral (Mistral format) | T + I<sup>+</sup> | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistralai/Pixtral-12B-2409`, etc. | | ✅︎ | ✅︎ |
|
||||||
| `QwenVLForConditionalGeneration`<sup>^</sup> | Qwen-VL | T + I<sup>E+</sup> | `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
| `QwenVLForConditionalGeneration`<sup>^</sup> | Qwen-VL | T + I<sup>E+</sup> | `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||||
| `Qwen2AudioForConditionalGeneration` | Qwen2-Audio | T + A<sup>+</sup> | `Qwen/Qwen2-Audio-7B-Instruct` | | ✅︎ | ✅︎ |
|
| `Qwen2AudioForConditionalGeneration` | Qwen2-Audio | T + A<sup>+</sup> | `Qwen/Qwen2-Audio-7B-Instruct` | | ✅︎ | ✅︎ |
|
||||||
|
|||||||
@ -2,10 +2,14 @@
|
|||||||
|
|
||||||
Reinforcement Learning from Human Feedback (RLHF) is a technique that fine-tunes language models using human-generated preference data to align model outputs with desired behaviors.
|
Reinforcement Learning from Human Feedback (RLHF) is a technique that fine-tunes language models using human-generated preference data to align model outputs with desired behaviors.
|
||||||
|
|
||||||
vLLM can be used to generate the completions for RLHF. The best way to do this is with libraries like [TRL](https://github.com/huggingface/trl), [OpenRLHF](https://github.com/OpenRLHF/OpenRLHF) and [verl](https://github.com/volcengine/verl).
|
vLLM can be used to generate the completions for RLHF. Some ways to do this include using libraries like [TRL](https://github.com/huggingface/trl), [OpenRLHF](https://github.com/OpenRLHF/OpenRLHF), [verl](https://github.com/volcengine/verl) and [unsloth](https://github.com/unslothai/unsloth).
|
||||||
|
|
||||||
See the following basic examples to get started if you don't want to use an existing library:
|
See the following basic examples to get started if you don't want to use an existing library:
|
||||||
|
|
||||||
- [Training and inference processes are located on separate GPUs (inspired by OpenRLHF)](../examples/offline_inference/rlhf.md)
|
- [Training and inference processes are located on separate GPUs (inspired by OpenRLHF)](../examples/offline_inference/rlhf.md)
|
||||||
- [Training and inference processes are colocated on the same GPUs using Ray](../examples/offline_inference/rlhf_colocate.md)
|
- [Training and inference processes are colocated on the same GPUs using Ray](../examples/offline_inference/rlhf_colocate.md)
|
||||||
- [Utilities for performing RLHF with vLLM](../examples/offline_inference/rlhf_utils.md)
|
- [Utilities for performing RLHF with vLLM](../examples/offline_inference/rlhf_utils.md)
|
||||||
|
|
||||||
|
See the following notebooks showing how to use vLLM for GRPO:
|
||||||
|
|
||||||
|
- [Qwen-3 4B GRPO using Unsloth + vLLM](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_(4B)-GRPO.ipynb)
|
||||||
|
|||||||
@ -190,6 +190,37 @@ def run_phi4mm(question: str, audio_count: int) -> ModelRequestData:
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def run_phi4_multimodal(question: str, audio_count: int) -> ModelRequestData:
    """
    Build the request data for the HF-format Phi-4-multimodal-instruct model
    with audio inputs.

    The model accepts both image and audio inputs; this example covers the
    audio path only.

    Args:
        question: Text appended after the audio placeholders in the user turn.
        audio_count: Number of ``<|audio|>`` placeholders to insert; also used
            as the per-prompt audio limit.

    Returns:
        A ``ModelRequestData`` holding the engine arguments, the single prompt
        string and the speech LoRA request.
    """
    # The checkpoint is pinned to the "refs/pr/70" revision.
    checkpoint_dir = snapshot_download(
        "microsoft/Phi-4-multimodal-instruct", revision="refs/pr/70"
    )

    # Since the vision-lora and speech-lora co-exist with the base model,
    # we have to manually specify the path of the lora weights.
    speech_adapter_dir = os.path.join(checkpoint_dir, "speech-lora")
    speech_lora = LoRARequest("speech", 1, speech_adapter_dir)

    # One placeholder per audio clip, placed right before the question.
    audio_tags = "<|audio|>" * audio_count
    user_prompt = f"<|user|>{audio_tags}{question}<|end|><|assistant|>"

    args = EngineArgs(
        model=checkpoint_dir,
        max_model_len=12800,
        max_num_seqs=2,
        enable_lora=True,
        max_lora_rank=320,
        limit_mm_per_prompt={"audio": audio_count},
    )

    return ModelRequestData(
        engine_args=args,
        prompt=user_prompt,
        lora_requests=[speech_lora],
    )
|
||||||
|
|
||||||
|
|
||||||
# Qwen2-Audio
|
# Qwen2-Audio
|
||||||
def run_qwen2_audio(question: str, audio_count: int) -> ModelRequestData:
|
def run_qwen2_audio(question: str, audio_count: int) -> ModelRequestData:
|
||||||
model_name = "Qwen/Qwen2-Audio-7B-Instruct"
|
model_name = "Qwen/Qwen2-Audio-7B-Instruct"
|
||||||
@ -303,6 +334,7 @@ model_example_map = {
|
|||||||
"granite_speech": run_granite_speech,
|
"granite_speech": run_granite_speech,
|
||||||
"minicpmo": run_minicpmo,
|
"minicpmo": run_minicpmo,
|
||||||
"phi4_mm": run_phi4mm,
|
"phi4_mm": run_phi4mm,
|
||||||
|
"phi4_multimodal": run_phi4_multimodal,
|
||||||
"qwen2_audio": run_qwen2_audio,
|
"qwen2_audio": run_qwen2_audio,
|
||||||
"qwen2_5_omni": run_qwen2_5_omni,
|
"qwen2_5_omni": run_qwen2_5_omni,
|
||||||
"ultravox": run_ultravox,
|
"ultravox": run_ultravox,
|
||||||
|
|||||||
@ -3,12 +3,12 @@
|
|||||||
import argparse
|
import argparse
|
||||||
import datetime
|
import datetime
|
||||||
import os
|
import os
|
||||||
import re
|
|
||||||
from typing import Union
|
from typing import Union
|
||||||
|
|
||||||
import albumentations
|
import albumentations
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import rasterio
|
import rasterio
|
||||||
|
import regex as re
|
||||||
import torch
|
import torch
|
||||||
from einops import rearrange
|
from einops import rearrange
|
||||||
from terratorch.datamodules import Sen1Floods11NonGeoDataModule
|
from terratorch.datamodules import Sen1Floods11NonGeoDataModule
|
||||||
|
|||||||
@ -29,6 +29,7 @@ import shutil
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from vllm import LLM, EngineArgs
|
from vllm import LLM, EngineArgs
|
||||||
|
from vllm.model_executor.model_loader import ShardedStateLoader
|
||||||
from vllm.utils import FlexibleArgumentParser
|
from vllm.utils import FlexibleArgumentParser
|
||||||
|
|
||||||
|
|
||||||
@ -39,7 +40,10 @@ def parse_args():
|
|||||||
"--output", "-o", required=True, type=str, help="path to output checkpoint"
|
"--output", "-o", required=True, type=str, help="path to output checkpoint"
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--file-pattern", type=str, help="string pattern of saved filenames"
|
"--file-pattern",
|
||||||
|
type=str,
|
||||||
|
default=ShardedStateLoader.DEFAULT_PATTERN,
|
||||||
|
help="string pattern of saved filenames",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--max-file-size",
|
"--max-file-size",
|
||||||
|
|||||||
@ -316,6 +316,85 @@ def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B
|
||||||
|
def run_hyperclovax_seed_vision(
    questions: list[str], modality: str
) -> ModelRequestData:
    """
    Build the request data for HyperCLOVAX-SEED-Vision-Instruct-3B.

    Each question becomes its own single-turn conversation consisting of one
    media entry (image or video) followed by the question text. The
    conversations are rendered to prompt strings with the tokenizer's chat
    template.

    Args:
        questions: Questions to ask, one conversation per question.
        modality: Either ``"image"`` or ``"video"``.

    Returns:
        A ``ModelRequestData`` with the engine arguments and rendered prompts.

    Raises:
        ValueError: If ``modality`` is neither ``"image"`` nor ``"video"``
            (checked while iterating over ``questions``).
    """
    model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    # Image prompts get the shorter context window; everything else the longer one.
    context_len = 8192 if modality == "image" else 16384
    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=context_len,
        limit_mm_per_prompt={modality: 1},
    )

    conversations = []
    for question in questions:
        if modality == "image":
            # Optional image hints (left empty in this example):
            #   ocr: List the words in the image in raster order.
            #       Even if the word order feels unnatural for reading,
            #       the model will handle it as long as it follows raster order.
            #       e.g. "Naver, CLOVA, bigshane"
            #   lens_keywords: List the entity names in the image.
            #       e.g. "iPhone"
            #   lens_local_keywords: List the entity names with quads in the image.
            #       e.g. "[0.07, 0.21, 0.92, 0.90] iPhone"
            media_entry = {
                "type": "image",
                "ocr": "",
                "lens_keywords": "",
                "lens_local_keywords": "",
            }
        elif modality == "video":
            media_entry = {"type": "video"}
        else:
            raise ValueError(f"Unsupported modality: {modality}")

        text_entry = {"type": "text", "text": question}
        conversations.append(
            [{"role": "user", "content": [media_entry, text_entry]}]
        )

    prompts = tokenizer.apply_chat_template(
        conversations,
        tokenize=False,
        add_generation_prompt=True,
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=None,
    )
|
||||||
|
|
||||||
|
|
||||||
# Idefics3-8B-Llama3
|
# Idefics3-8B-Llama3
|
||||||
def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
|
def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
|
||||||
assert modality == "image"
|
assert modality == "image"
|
||||||
@ -389,6 +468,39 @@ def run_tarsier(questions: list[str], modality: str) -> ModelRequestData:
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# Intern-S1
|
||||||
|
def run_interns1(questions: list[str], modality: str) -> ModelRequestData:
    """
    Build the request data for the Intern-S1 model.

    Each question is prefixed with the placeholder token of the requested
    modality and rendered to a prompt string with the tokenizer's chat
    template.

    Args:
        questions: Questions to ask, one prompt per question.
        modality: Either ``"image"`` or ``"video"``.

    Returns:
        A ``ModelRequestData`` with the engine arguments and rendered prompts.

    Raises:
        ValueError: If ``modality`` is neither ``"image"`` nor ``"video"``.
    """
    # Resolve the placeholder first so that an unsupported modality fails
    # immediately with a clear error instead of an unbound-variable error
    # after the engine arguments and tokenizer have already been created.
    placeholder_by_modality = {
        "image": "<IMG_CONTEXT>",
        "video": "<video>",
    }
    try:
        placeholder = placeholder_by_modality[modality]
    except KeyError:
        raise ValueError(f"Unsupported modality: {modality}") from None

    model_name = "internlm/Intern-S1"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
        max_num_seqs=2,
        limit_mm_per_prompt={modality: 1},
        enforce_eager=True,
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    # One single-turn conversation per question.
    messages = [
        [{"role": "user", "content": f"{placeholder}\n{question}"}]
        for question in questions
    ]
    prompts = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
|
||||||
|
|
||||||
|
|
||||||
# InternVL
|
# InternVL
|
||||||
def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
|
def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
|
||||||
model_name = "OpenGVLab/InternVL3-2B"
|
model_name = "OpenGVLab/InternVL3-2B"
|
||||||
@ -987,6 +1099,41 @@ def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData:
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# HF format Phi-4-multimodal-instruct
|
||||||
|
def run_phi4_multimodal(questions: list[str], modality: str) -> ModelRequestData:
    """
    Build the request data for the HF-format Phi-4-multimodal-instruct model
    with image inputs.

    The model accepts both image and audio inputs; this example covers the
    image path only.

    Args:
        questions: Questions to ask, one prompt per question.
        modality: Must be ``"image"``.

    Returns:
        A ``ModelRequestData`` holding the engine arguments, the prompts and
        the vision LoRA request.
    """
    assert modality == "image"

    # The checkpoint is pinned to the "refs/pr/70" revision.
    checkpoint_dir = snapshot_download(
        "microsoft/Phi-4-multimodal-instruct", revision="refs/pr/70"
    )

    # Since the vision-lora and speech-lora co-exist with the base model,
    # we have to manually specify the path of the lora weights.
    vision_adapter_dir = os.path.join(checkpoint_dir, "vision-lora")
    vision_lora = LoRARequest("vision", 1, vision_adapter_dir)

    # Each prompt carries exactly one image placeholder before the question.
    prompts = []
    for question in questions:
        prompts.append(f"<|user|><|image|>{question}<|end|><|assistant|>")

    args = EngineArgs(
        model=checkpoint_dir,
        max_model_len=5120,
        max_num_seqs=2,
        max_num_batched_tokens=12800,
        enable_lora=True,
        max_lora_rank=320,
        # Note - mm_processor_kwargs can also be passed to generate/chat calls
        mm_processor_kwargs={"dynamic_hd": 16},
        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(
        engine_args=args,
        prompts=prompts,
        lora_requests=[vision_lora],
    )
|
||||||
|
|
||||||
|
|
||||||
# Pixtral HF-format
|
# Pixtral HF-format
|
||||||
def run_pixtral_hf(questions: list[str], modality: str) -> ModelRequestData:
|
def run_pixtral_hf(questions: list[str], modality: str) -> ModelRequestData:
|
||||||
assert modality == "image"
|
assert modality == "image"
|
||||||
@ -1222,7 +1369,9 @@ model_example_map = {
|
|||||||
"glm4v": run_glm4v,
|
"glm4v": run_glm4v,
|
||||||
"glm4_1v": run_glm4_1v,
|
"glm4_1v": run_glm4_1v,
|
||||||
"h2ovl_chat": run_h2ovl,
|
"h2ovl_chat": run_h2ovl,
|
||||||
|
"hyperclovax_seed_vision": run_hyperclovax_seed_vision,
|
||||||
"idefics3": run_idefics3,
|
"idefics3": run_idefics3,
|
||||||
|
"interns1": run_interns1,
|
||||||
"internvl_chat": run_internvl,
|
"internvl_chat": run_internvl,
|
||||||
"nemotron_vl": run_nemotron_vl,
|
"nemotron_vl": run_nemotron_vl,
|
||||||
"keye_vl": run_keye_vl,
|
"keye_vl": run_keye_vl,
|
||||||
@ -1244,6 +1393,7 @@ model_example_map = {
|
|||||||
"paligemma2": run_paligemma2,
|
"paligemma2": run_paligemma2,
|
||||||
"phi3_v": run_phi3v,
|
"phi3_v": run_phi3v,
|
||||||
"phi4_mm": run_phi4mm,
|
"phi4_mm": run_phi4mm,
|
||||||
|
"phi4_multimodal": run_phi4_multimodal,
|
||||||
"pixtral_hf": run_pixtral_hf,
|
"pixtral_hf": run_pixtral_hf,
|
||||||
"qwen_vl": run_qwen_vl,
|
"qwen_vl": run_qwen_vl,
|
||||||
"qwen2_vl": run_qwen2_vl,
|
"qwen2_vl": run_qwen2_vl,
|
||||||
|
|||||||
@ -253,6 +253,33 @@ def load_smolvlm(question: str, image_urls: list[str]) -> ModelRequestData:
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def load_interns1(question: str, image_urls: list[str]) -> ModelRequestData:
    """
    Build the multi-image request data for the Intern-S1 model.

    Every image gets a numbered ``Image-<i>: <IMG_CONTEXT>`` line (numbering
    starts at 1) ahead of the question, and the resulting user message is
    rendered with the tokenizer's chat template.

    Args:
        question: Question to ask about the images.
        image_urls: URLs of the images; their count is also used as the
            per-prompt image limit.

    Returns:
        A ``ModelRequestData`` with the engine arguments, the prompt and the
        fetched images.
    """
    model_name = "internlm/Intern-S1"
    num_images = len(image_urls)

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=4096,
        limit_mm_per_prompt={"image": num_images},
    )

    # Each entry already ends with a newline and the entries are joined with
    # another newline, so consecutive image lines are separated by a blank line.
    image_lines = [
        f"Image-{index}: <IMG_CONTEXT>\n" for index in range(1, num_images + 1)
    ]
    placeholders = "\n".join(image_lines)
    user_turn = {"role": "user", "content": f"{placeholders}\n{question}"}

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    prompt = tokenizer.apply_chat_template(
        [user_turn], tokenize=False, add_generation_prompt=True
    )

    images = [fetch_image(image_url) for image_url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )
|
||||||
|
|
||||||
|
|
||||||
def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
|
def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||||
model_name = "OpenGVLab/InternVL2-2B"
|
model_name = "OpenGVLab/InternVL2-2B"
|
||||||
|
|
||||||
@ -289,6 +316,53 @@ def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def load_hyperclovax_seed_vision(
    question: str, image_urls: list[str]
) -> ModelRequestData:
    """
    Build the multi-image request data for HyperCLOVAX-SEED-Vision-Instruct-3B.

    A single user message is assembled from one image entry per URL followed
    by the question text, then rendered with the tokenizer's chat template.

    Args:
        question: Question to ask about the images.
        image_urls: URLs of the images; their count is also used as the
            per-prompt image limit.

    Returns:
        A ``ModelRequestData`` with the engine arguments, the prompt and the
        fetched images.
    """
    model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=16384,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # Image entries come first; the optional hint fields are left empty here.
    content = [
        {
            "type": "image",
            "image": image_url,
            "ocr": "",
            "lens_keywords": "",
            "lens_local_keywords": "",
        }
        for image_url in image_urls
    ]
    content.append({"type": "text", "text": question})
    user_turn = {"role": "user", "content": content}

    prompt = tokenizer.apply_chat_template(
        [user_turn],
        tokenize=False,
        add_generation_prompt=True,
    )

    images = [fetch_image(image_url) for image_url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        stop_token_ids=None,
        image_data=images,
    )
|
||||||
|
|
||||||
|
|
||||||
def load_llava(question: str, image_urls: list[str]) -> ModelRequestData:
|
def load_llava(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||||
# NOTE: CAUTION! Original Llava models wasn't really trained on multi-image inputs,
|
# NOTE: CAUTION! Original Llava models wasn't really trained on multi-image inputs,
|
||||||
# it will generate poor response for multi-image inputs!
|
# it will generate poor response for multi-image inputs!
|
||||||
@ -686,6 +760,40 @@ def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData:
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def load_phi4_multimodal(question: str, image_urls: list[str]) -> ModelRequestData:
    """
    Build the multi-image request data for the HF-format
    Phi-4-multimodal-instruct model.

    The model accepts both image and audio inputs; this example shows how to
    pass several images in one prompt.

    Args:
        question: Question to ask about the images.
        image_urls: URLs of the images; their count determines both the number
            of ``<|image|>`` placeholders and the per-prompt image limit.

    Returns:
        A ``ModelRequestData`` holding the engine arguments, the prompt, the
        fetched images and the vision LoRA request.
    """
    num_images = len(image_urls)

    # The checkpoint is pinned to the "refs/pr/70" revision.
    checkpoint_dir = snapshot_download(
        "microsoft/Phi-4-multimodal-instruct", revision="refs/pr/70"
    )

    # Since the vision-lora and speech-lora co-exist with the base model,
    # we have to manually specify the path of the lora weights.
    vision_adapter_dir = os.path.join(checkpoint_dir, "vision-lora")

    args = EngineArgs(
        model=checkpoint_dir,
        max_model_len=4096,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": num_images},
        enable_lora=True,
        max_lora_rank=320,
        # Note - mm_processor_kwargs can also be passed to generate/chat calls
        mm_processor_kwargs={"dynamic_hd": 4},
    )

    # One placeholder per image, placed right before the question.
    image_tags = "<|image|>" * num_images
    user_prompt = f"<|user|>{image_tags}{question}<|end|><|assistant|>"

    images = [fetch_image(image_url) for image_url in image_urls]
    return ModelRequestData(
        engine_args=args,
        prompt=user_prompt,
        image_data=images,
        lora_requests=[LoRARequest("vision", 1, vision_adapter_dir)],
    )
|
||||||
|
|
||||||
|
|
||||||
def load_qwen_vl_chat(question: str, image_urls: list[str]) -> ModelRequestData:
|
def load_qwen_vl_chat(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||||
model_name = "Qwen/Qwen-VL-Chat"
|
model_name = "Qwen/Qwen-VL-Chat"
|
||||||
engine_args = EngineArgs(
|
engine_args = EngineArgs(
|
||||||
@ -899,7 +1007,9 @@ model_example_map = {
|
|||||||
"gemma3": load_gemma3,
|
"gemma3": load_gemma3,
|
||||||
"h2ovl_chat": load_h2ovl,
|
"h2ovl_chat": load_h2ovl,
|
||||||
"idefics3": load_idefics3,
|
"idefics3": load_idefics3,
|
||||||
|
"interns1": load_interns1,
|
||||||
"internvl_chat": load_internvl,
|
"internvl_chat": load_internvl,
|
||||||
|
"hyperclovax_seed_vision": load_hyperclovax_seed_vision,
|
||||||
"keye_vl": load_keye_vl,
|
"keye_vl": load_keye_vl,
|
||||||
"kimi_vl": load_kimi_vl,
|
"kimi_vl": load_kimi_vl,
|
||||||
"llava": load_llava,
|
"llava": load_llava,
|
||||||
@ -912,6 +1022,7 @@ model_example_map = {
|
|||||||
"ovis": load_ovis,
|
"ovis": load_ovis,
|
||||||
"phi3_v": load_phi3v,
|
"phi3_v": load_phi3v,
|
||||||
"phi4_mm": load_phi4mm,
|
"phi4_mm": load_phi4mm,
|
||||||
|
"phi4_multimodal": load_phi4_multimodal,
|
||||||
"pixtral_hf": load_pixtral_hf,
|
"pixtral_hf": load_pixtral_hf,
|
||||||
"qwen_vl_chat": load_qwen_vl_chat,
|
"qwen_vl_chat": load_qwen_vl_chat,
|
||||||
"qwen2_vl": load_qwen2_vl,
|
"qwen2_vl": load_qwen2_vl,
|
||||||
|
|||||||
@ -233,7 +233,7 @@ main() {
|
|||||||
# Run Benchmark
|
# Run Benchmark
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
cd ../../../benchmarks/
|
cd ../../../benchmarks/
|
||||||
python3 benchmark_serving.py --port 10001 --seed $(date +%s) \
|
vllm bench serve --port 10001 --seed $(date +%s) \
|
||||||
--model $MODEL \
|
--model $MODEL \
|
||||||
--dataset-name random --random-input-len 7500 --random-output-len 200 \
|
--dataset-name random --random-input-len 7500 --random-output-len 200 \
|
||||||
--num-prompts 200 --burstiness 100 --request-rate 2 | tee benchmark.log
|
--num-prompts 200 --burstiness 100 --request-rate 2 | tee benchmark.log
|
||||||
|
|||||||
@ -28,7 +28,7 @@ Submit some sample requests to the server:
|
|||||||
```bash
|
```bash
|
||||||
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
||||||
|
|
||||||
python3 ../../../benchmarks/benchmark_serving.py \
|
vllm bench serve \
|
||||||
--model mistralai/Mistral-7B-v0.1 \
|
--model mistralai/Mistral-7B-v0.1 \
|
||||||
--tokenizer mistralai/Mistral-7B-v0.1 \
|
--tokenizer mistralai/Mistral-7B-v0.1 \
|
||||||
--endpoint /v1/completions \
|
--endpoint /v1/completions \
|
||||||
|
|||||||
@ -122,7 +122,7 @@ main() {
|
|||||||
|
|
||||||
# begin benchmark
|
# begin benchmark
|
||||||
cd ../../../../benchmarks/
|
cd ../../../../benchmarks/
|
||||||
python3 benchmark_serving.py --port 9000 --seed $(date +%s) \
|
vllm bench serve --port 9000 --seed $(date +%s) \
|
||||||
--model meta-llama/Llama-3.1-8B-Instruct \
|
--model meta-llama/Llama-3.1-8B-Instruct \
|
||||||
--dataset-name random --random-input-len 7500 --random-output-len 200 \
|
--dataset-name random --random-input-len 7500 --random-output-len 200 \
|
||||||
--num-prompts 200 --burstiness 100 --request-rate 3.6 | tee benchmark.log
|
--num-prompts 200 --burstiness 100 --request-rate 3.6 | tee benchmark.log
|
||||||
|
|||||||
@ -10,7 +10,8 @@ setuptools>=77.0.3,<80.0.0
|
|||||||
--extra-index-url https://download.pytorch.org/whl/cpu
|
--extra-index-url https://download.pytorch.org/whl/cpu
|
||||||
torch==2.6.0+cpu; platform_machine == "x86_64" # torch>2.6.0+cpu has performance regression on x86 platform, see https://github.com/pytorch/pytorch/pull/151218
|
torch==2.6.0+cpu; platform_machine == "x86_64" # torch>2.6.0+cpu has performance regression on x86 platform, see https://github.com/pytorch/pytorch/pull/151218
|
||||||
torch==2.7.0; platform_system == "Darwin"
|
torch==2.7.0; platform_system == "Darwin"
|
||||||
torch==2.7.0; platform_machine == "ppc64le" or platform_machine == "aarch64"
|
torch==2.7.0; platform_machine == "ppc64le"
|
||||||
|
torch==2.6.0; platform_machine == "aarch64" # for arm64 CPUs, torch 2.7.0 has an issue: https://github.com/vllm-project/vllm/issues/17960
|
||||||
|
|
||||||
# required for the image processor of minicpm-o-2_6, this must be updated alongside torch
|
# required for the image processor of minicpm-o-2_6, this must be updated alongside torch
|
||||||
torchaudio; platform_machine != "ppc64le" and platform_machine != "s390x"
|
torchaudio; platform_machine != "ppc64le" and platform_machine != "s390x"
|
||||||
@ -25,3 +26,6 @@ datasets # for benchmark scripts
|
|||||||
intel-openmp==2024.2.1; platform_machine == "x86_64"
|
intel-openmp==2024.2.1; platform_machine == "x86_64"
|
||||||
intel_extension_for_pytorch==2.6.0; platform_machine == "x86_64" # torch>2.6.0+cpu has performance regression on x86 platform, see https://github.com/pytorch/pytorch/pull/151218
|
intel_extension_for_pytorch==2.6.0; platform_machine == "x86_64" # torch>2.6.0+cpu has performance regression on x86 platform, see https://github.com/pytorch/pytorch/pull/151218
|
||||||
triton==3.2.0; platform_machine == "x86_64" # Triton is required for torch 2.6+cpu, as it is imported in torch.compile.
|
triton==3.2.0; platform_machine == "x86_64" # Triton is required for torch 2.6+cpu, as it is imported in torch.compile.
|
||||||
|
|
||||||
|
# Use this to gather CPU info and optimize based on ARM Neoverse cores
|
||||||
|
py-cpuinfo; platform_machine == "aarch64"
|
||||||
|
|||||||
@ -19,8 +19,8 @@ nixl==0.3.0
|
|||||||
--find-links https://storage.googleapis.com/libtpu-releases/index.html
|
--find-links https://storage.googleapis.com/libtpu-releases/index.html
|
||||||
--find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html
|
--find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html
|
||||||
--find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
|
--find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
|
||||||
torch==2.9.0.dev20250716
|
torch==2.9.0.dev20250724
|
||||||
torchvision==0.24.0.dev20250716
|
torchvision==0.24.0.dev20250724
|
||||||
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.9.0.dev20250716-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
|
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.9.0.dev20250724-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
|
||||||
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.9.0.dev20250716-cp312-cp312-linux_x86_64.whl ; python_version == "3.12"
|
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.9.0.dev20250724-cp312-cp312-linux_x86_64.whl ; python_version == "3.12"
|
||||||
|
|
||||||
|
|||||||
@ -1062,8 +1062,17 @@ class VllmRunner:
|
|||||||
return [req_output.outputs.score for req_output in req_outputs]
|
return [req_output.outputs.score for req_output in req_outputs]
|
||||||
|
|
||||||
def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]:
|
def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]:
|
||||||
executor = self.llm.llm_engine.model_executor
|
if hasattr(self.llm.llm_engine, "model_executor"):
|
||||||
return executor.apply_model(func)
|
# This works either in V0 or in V1 with
|
||||||
|
# VLLM_ENABLE_V1_MULTIPROCESSING=0
|
||||||
|
executor = self.llm.llm_engine.model_executor
|
||||||
|
return executor.apply_model(func)
|
||||||
|
|
||||||
|
# This works in V1 with VLLM_ALLOW_INSECURE_SERIALIZATION=1
|
||||||
|
def _apply_model(self):
|
||||||
|
return func(self.get_model())
|
||||||
|
|
||||||
|
return self.llm.llm_engine.collective_rpc(_apply_model)
|
||||||
|
|
||||||
def __enter__(self):
|
def __enter__(self):
|
||||||
return self
|
return self
|
||||||
|
|||||||
93
tests/entrypoints/openai/test_skip_tokenizer.py
Normal file
93
tests/entrypoints/openai/test_skip_tokenizer.py
Normal file
@ -0,0 +1,93 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
|
import base64
|
||||||
|
import io
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pytest
|
||||||
|
import requests
|
||||||
|
import torch
|
||||||
|
|
||||||
|
from ...utils import RemoteOpenAIServer
|
||||||
|
|
||||||
|
MODEL_NAME = "christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM"
|
||||||
|
DTYPE = "float16"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
    """Request ``run_with_both_engines`` automatically for every test here.

    The fixture has no body of its own; declaring it ``autouse`` is only a
    way to make each test in this module depend on ``run_with_both_engines``.
    """
    # Simple autouse wrapper to run both engines for each test
    # This can be promoted up to conftest.py to run for every
    # test in a package
    pass
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="module")
def server():
    """Launch a single ``RemoteOpenAIServer`` for the module and yield it.

    The server is started for ``MODEL_NAME`` with the embed task and with
    tokenizer initialisation skipped; it is torn down when the context
    manager exits after the last test of the module.
    """
    cli_args = [
        "--task",
        "embed",
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        DTYPE,
        "--enforce-eager",
        "--trust-remote-code",
        "--skip-tokenizer-init",
        "--max-num-seqs",
        "32",
    ]

    with RemoteOpenAIServer(MODEL_NAME, cli_args) as srv:
        yield srv
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_request(server: RemoteOpenAIServer, model_name: str):
    """Send one pooling request built from raw tensors and check its size.

    Two constant float16 tensors are serialised with ``torch.save``,
    base64-encoded and posted as ``image_embeds`` content. The base64
    response payload is decoded as float32 and its element count checked.
    """

    def _to_base64(tensor: torch.Tensor) -> str:
        # Serialise with torch.save into memory, then base64-encode as text.
        buf = io.BytesIO()
        torch.save(tensor, buf)
        return base64.b64encode(buf.getvalue()).decode('utf-8')

    pixel_values = torch.full((6, 512, 512), 1.0, dtype=torch.float16)
    location_coords = torch.full((1, 2), 1.0, dtype=torch.float16)

    image_embeds = {
        "pixel_values": _to_base64(pixel_values),
        "location_coords": _to_base64(location_coords),
    }
    request_body = {
        "model": model_name,
        "additional_data": {
            "prompt_token_ids": [1]
        },
        "encoding_format": "base64",
        "messages": [{
            "role": "user",
            "content": [{
                "type": "image_embeds",
                "image_embeds": image_embeds,
            }],
        }],
    }

    # test single pooling
    response = requests.post(server.url_for("pooling"), json=request_body)
    response.raise_for_status()

    encoded = response.json()["data"][0]['data']
    decoded = np.frombuffer(base64.b64decode(encoded), dtype=np.float32)

    assert len(decoded) == 524288
|
||||||
191
tests/kernels/attention/test_aiter_flash_attn.py
Normal file
191
tests/kernels/attention/test_aiter_flash_attn.py
Normal file
@ -0,0 +1,191 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
import torch
|
||||||
|
|
||||||
|
import vllm.v1.attention.backends.rocm_aiter_fa # noqa: F401
|
||||||
|
from vllm.platforms import current_platform
|
||||||
|
|
||||||
|
# Parametrisation grids for test_varlen_with_paged_kv.
# Each NUM_HEADS entry is (num_query_heads, num_kv_heads).
NUM_HEADS = [(4, 4), (8, 2), (16, 2)]
HEAD_SIZES = [128, 256]
BLOCK_SIZES = [16, 32]
DTYPES = [torch.float16, torch.bfloat16]
# None means query/key/value are passed to the kernel without a dtype cast.
QDTYPES = [None]
# one value large enough to test overflow in index calculation.
# one value small enough to test the schema op check
NUM_BLOCKS = [32768, 2048]
|
||||||
|
|
||||||
|
|
||||||
|
def ref_paged_attn(
    query: torch.Tensor,
    key_cache: torch.Tensor,
    value_cache: torch.Tensor,
    query_lens: list[int],
    kv_lens: list[int],
    block_tables: torch.Tensor,
    scale: float,
    sliding_window: Optional[int] = None,
    soft_cap: Optional[float] = None,
) -> torch.Tensor:
    """Reference (pure PyTorch) variable-length attention over a paged KV cache.

    Args:
        query: Packed queries of all sequences,
            ``(sum(query_lens), num_query_heads, head_size)``.
        key_cache: Paged keys,
            ``(num_blocks, block_size, num_kv_heads, head_size)``.
        value_cache: Paged values, indexed the same way as ``key_cache``.
        query_lens: Number of query tokens per sequence.
        kv_lens: Number of key/value tokens per sequence. Entries may be
            Python ints or 0-dim integer tensors.
        block_tables: ``(num_seqs, max_blocks_per_seq)`` block indices into
            the caches.
        scale: Factor applied to the queries before the QK product.
        sliding_window: If given, keys further back than this many positions
            are masked out in addition to the causal mask.
        soft_cap: If given, logits are squashed with
            ``soft_cap * tanh(logits / soft_cap)`` before masking.

    Returns:
        Attention output packed like ``query``. ``query`` itself is left
        unmodified.
    """
    block_tables = block_tables.cpu().numpy()
    _, block_size, num_kv_heads, head_size = key_cache.shape

    outputs: list[torch.Tensor] = []
    start_idx = 0
    for i, query_len in enumerate(query_lens):
        # int() so that 0-dim tensor entries behave like plain lengths.
        kv_len = int(kv_lens[i])

        # Out-of-place multiply: the slice is a view of `query`, so an
        # in-place `*=` would rescale the caller's tensor on every call.
        q = query[start_idx:start_idx + query_len] * scale

        num_kv_blocks = (kv_len + block_size - 1) // block_size
        block_indices = block_tables[i, :num_kv_blocks]

        # Gather this sequence's blocks and flatten them into token order,
        # dropping the padding at the tail of the last block.
        k = key_cache[block_indices].view(-1, num_kv_heads, head_size)
        k = k[:kv_len]
        v = value_cache[block_indices].view(-1, num_kv_heads, head_size)
        v = v[:kv_len]

        if q.shape[1] != k.shape[1]:
            # Grouped-query attention: repeat each KV head for its group.
            k = torch.repeat_interleave(k, q.shape[1] // k.shape[1], dim=1)
            v = torch.repeat_interleave(v, q.shape[1] // v.shape[1], dim=1)
        attn = torch.einsum("qhd,khd->hqk", q, k).float()

        # True entries are masked out. Build the mask on the same device as
        # the inputs instead of relying on the global default device.
        empty_mask = torch.ones(query_len, kv_len, device=q.device)
        # Causal mask with the queries aligned to the end of the KV range.
        mask = torch.triu(empty_mask, diagonal=kv_len - query_len + 1).bool()
        if sliding_window is not None:
            sliding_window_mask = torch.triu(
                empty_mask,
                diagonal=kv_len - (query_len + sliding_window) +
                1).bool().logical_not()
            mask |= sliding_window_mask
        if soft_cap is not None:
            attn = soft_cap * torch.tanh(attn / soft_cap)
        attn.masked_fill_(mask, float("-inf"))
        attn = torch.softmax(attn, dim=-1).to(v.dtype)
        out = torch.einsum("hqk,khd->qhd", attn, v)

        outputs.append(out)
        start_idx += query_len

    return torch.cat(outputs, dim=0)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skipif(not current_platform.is_rocm(),
                    reason="Only ROCm is supported")
@pytest.mark.parametrize("seq_lens",
                         [[(10, 1328), (5, 18),
                           (129, 463)], [(8, 523), (24, 37), (3, 2011)]])
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
@pytest.mark.parametrize("sliding_window", [None, 256])
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("soft_cap", [None])
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
@pytest.mark.parametrize("q_dtype", QDTYPES)
@torch.inference_mode()
def test_varlen_with_paged_kv(
    seq_lens: list[tuple[int, int]],
    num_heads: tuple[int, int],
    head_size: int,
    sliding_window: Optional[int],
    dtype: torch.dtype,
    block_size: int,
    soft_cap: Optional[float],
    num_blocks: int,
    q_dtype: Optional[torch.dtype],
) -> None:
    """Compare ``torch.ops.vllm.flash_attn_varlen_func`` with the reference.

    Random queries and a random paged KV cache are built for the given
    ``(query_len, kv_len)`` pairs, the op writes into ``output``, and the
    result is compared against ``ref_paged_attn`` with dtype-dependent
    tolerances.
    """
    torch.set_default_device("cuda")
    current_platform.seed_everything(0)
    num_seqs = len(seq_lens)
    query_lens = [x[0] for x in seq_lens]
    kv_lens = [x[1] for x in seq_lens]
    num_query_heads = num_heads[0]
    num_kv_heads = num_heads[1]
    assert num_query_heads % num_kv_heads == 0
    max_query_len = max(query_lens)
    max_kv_len = max(kv_lens)
    window_size = ((sliding_window - 1, 0) if sliding_window is not None else
                   (-1, -1))
    scale = head_size**-0.5

    query = torch.randn(sum(query_lens),
                        num_query_heads,
                        head_size,
                        dtype=dtype)
    key_cache = torch.randn(num_blocks,
                            block_size,
                            num_kv_heads,
                            head_size,
                            dtype=dtype)
    value_cache = torch.randn_like(key_cache)

    # Cumulative offsets with a leading 0, as expected by the varlen op.
    cu_query_lens = torch.tensor([0] + query_lens,
                                 dtype=torch.int32).cumsum(dim=0,
                                                           dtype=torch.int32)
    cu_seq_lens = torch.tensor([0] + kv_lens,
                               dtype=torch.int32).cumsum(dim=0,
                                                         dtype=torch.int32)
    # NOTE: kv_lens deliberately stays a list of ints. The op only consumes
    # cu_seq_lens, and the reference implementation is typed for list[int].

    max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size
    block_tables = torch.randint(0,
                                 num_blocks,
                                 (num_seqs, max_num_blocks_per_seq),
                                 dtype=torch.int32)

    output = torch.empty_like(query)

    maybe_quantized_query = query
    maybe_quantized_key_cache = key_cache
    maybe_quantized_value_cache = value_cache
    k_descale = None
    v_descale = None
    if q_dtype is not None:
        # QKV are drawn from N(0, 1): no need for a fp8 scaling factor
        maybe_quantized_query = query.to(q_dtype)
        maybe_quantized_key_cache = key_cache.to(q_dtype)
        maybe_quantized_value_cache = value_cache.to(q_dtype)

        scale_shape = (num_seqs, num_kv_heads)
        k_descale = torch.ones(scale_shape, dtype=torch.float32)
        v_descale = torch.ones(scale_shape, dtype=torch.float32)

    torch.ops.vllm.flash_attn_varlen_func(
        maybe_quantized_query,
        maybe_quantized_key_cache,
        maybe_quantized_value_cache,
        out=output,
        cu_seqlens_q=cu_query_lens,
        max_seqlen_q=max_query_len,
        max_seqlen_k=max_kv_len,
        softmax_scale=scale,
        alibi_slopes=None,
        window_size=window_size,
        block_table=block_tables,
        cu_seqlens_k=cu_seq_lens,
        k_scale=k_descale,
        v_scale=v_descale,
    )

    ref_output = ref_paged_attn(
        query=query,
        key_cache=key_cache,
        value_cache=value_cache,
        query_lens=query_lens,
        kv_lens=kv_lens,
        block_tables=block_tables,
        scale=scale,
        sliding_window=sliding_window,
        soft_cap=soft_cap,
    )

    # Looser tolerances when the inputs were cast to a quantized dtype.
    atol, rtol = 2e-2, 2e-2
    if q_dtype is not None:
        atol, rtol = 1.5e-1, 1.5e-1
    # assert_close raises on mismatch and already reports the greatest
    # absolute/relative difference, so no trailing message is needed.
    torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol)
|
||||||
@ -17,28 +17,34 @@ from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import (
|
|||||||
moe_permute, moe_permute_unpermute_supported, moe_unpermute)
|
moe_permute, moe_permute_unpermute_supported, moe_unpermute)
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
|
|
||||||
NUM_EXPERTS = [16, 64]
|
NUM_EXPERTS = [16, 64, 256]
|
||||||
TOP_KS = [2, 4, 6, 8]
|
TOP_KS = [2, 4, 6, 8]
|
||||||
EP_SIZE = [1, 4, 16]
|
EP_SIZE = [1, 4, 16]
|
||||||
current_platform.seed_everything(0)
|
current_platform.seed_everything(0)
|
||||||
|
|
||||||
|
|
||||||
def torch_permute(hidden_states: torch.Tensor,
|
def torch_permute(
|
||||||
topk_ids: torch.Tensor,
|
hidden_states: torch.Tensor,
|
||||||
token_expert_indices: torch.Tensor,
|
topk_ids: torch.Tensor,
|
||||||
topk: int,
|
# token_expert_indices: torch.Tensor,
|
||||||
n_expert: int,
|
topk: int,
|
||||||
n_local_expert: int,
|
n_expert: int,
|
||||||
start_expert: int,
|
n_local_expert: int,
|
||||||
expert_map: Optional[torch.Tensor] = None,
|
start_expert: int,
|
||||||
align_block_size: Optional[int] = None,
|
expert_map: Optional[torch.Tensor] = None,
|
||||||
fill_invalid_expert: int = -1) -> list[torch.Tensor]:
|
align_block_size: Optional[int] = None,
|
||||||
|
fill_invalid_expert: int = -1) -> list[torch.Tensor]:
|
||||||
n_token, n_hidden = hidden_states.shape[0], hidden_states.shape[1]
|
n_token, n_hidden = hidden_states.shape[0], hidden_states.shape[1]
|
||||||
if expert_map is not None:
|
if expert_map is not None:
|
||||||
is_local_expert = (expert_map[topk_ids] != -1)
|
is_local_expert = (expert_map[topk_ids] != -1)
|
||||||
not_local_expert = (expert_map[topk_ids] == -1)
|
not_local_expert = (expert_map[topk_ids] == -1)
|
||||||
topk_ids = is_local_expert * (
|
topk_ids = is_local_expert * (
|
||||||
topk_ids - start_expert) + not_local_expert * (topk_ids + n_expert)
|
topk_ids - start_expert) + not_local_expert * (topk_ids + n_expert)
|
||||||
|
token_expert_indices = torch.arange(0,
|
||||||
|
n_token * topk,
|
||||||
|
dtype=torch.int32,
|
||||||
|
device=hidden_states.device).reshape(
|
||||||
|
(n_token, topk))
|
||||||
|
|
||||||
sorted_topk_ids, sorted_indices = torch.sort(topk_ids.flatten(),
|
sorted_topk_ids, sorted_indices = torch.sort(topk_ids.flatten(),
|
||||||
stable=True)
|
stable=True)
|
||||||
@ -59,8 +65,8 @@ def torch_permute(hidden_states: torch.Tensor,
|
|||||||
valid_row_idx = []
|
valid_row_idx = []
|
||||||
if align_block_size is None:
|
if align_block_size is None:
|
||||||
|
|
||||||
permuted_hidden_states = hidden_states[dst_row_id2src_row_id_map %
|
permuted_hidden_states = hidden_states[dst_row_id2src_row_id_map //
|
||||||
n_token, ...]
|
topk, ...]
|
||||||
permuted_row_size = permuted_hidden_states.shape[0]
|
permuted_row_size = permuted_hidden_states.shape[0]
|
||||||
m_indices = torch.empty(permuted_row_size,
|
m_indices = torch.empty(permuted_row_size,
|
||||||
device="cuda",
|
device="cuda",
|
||||||
@ -73,14 +79,21 @@ def torch_permute(hidden_states: torch.Tensor,
|
|||||||
0, n_token * topk, device="cuda",
|
0, n_token * topk, device="cuda",
|
||||||
dtype=torch.int32)[src2dst_idx].reshape((n_token, topk))
|
dtype=torch.int32)[src2dst_idx].reshape((n_token, topk))
|
||||||
valid_row_idx += [i for i in range(expert_first_token_offset[-1])]
|
valid_row_idx += [i for i in range(expert_first_token_offset[-1])]
|
||||||
|
dst_row_id2src_row_id_map[
|
||||||
|
expert_first_token_offset[-1]:] = n_token * topk
|
||||||
return [
|
return [
|
||||||
permuted_hidden_states, expert_first_token_offset,
|
permuted_hidden_states, expert_first_token_offset,
|
||||||
src_row_id2dst_row_id_map, m_indices, valid_row_idx
|
src_row_id2dst_row_id_map, dst_row_id2src_row_id_map, m_indices,
|
||||||
|
valid_row_idx
|
||||||
]
|
]
|
||||||
else:
|
else:
|
||||||
permuted_row_size = (topk * n_token + n_expert *
|
permuted_row_size = (topk * n_token + n_expert *
|
||||||
(align_block_size - 1) + align_block_size -
|
(align_block_size - 1) + align_block_size -
|
||||||
1) // align_block_size * align_block_size
|
1) // align_block_size * align_block_size
|
||||||
|
permuted_idx = torch.full((permuted_row_size, ),
|
||||||
|
n_token * topk,
|
||||||
|
dtype=torch.int32,
|
||||||
|
device=hidden_states.device)
|
||||||
permuted_hidden_states = torch.empty((permuted_row_size, n_hidden),
|
permuted_hidden_states = torch.empty((permuted_row_size, n_hidden),
|
||||||
device="cuda",
|
device="cuda",
|
||||||
dtype=hidden_states.dtype)
|
dtype=hidden_states.dtype)
|
||||||
@ -105,13 +118,16 @@ def torch_permute(hidden_states: torch.Tensor,
|
|||||||
align_first_token_offset = align_expert_first_token_offset[i - 1]
|
align_first_token_offset = align_expert_first_token_offset[i - 1]
|
||||||
align_last_token_offset = align_expert_first_token_offset[i]
|
align_last_token_offset = align_expert_first_token_offset[i]
|
||||||
dst_row_id2src_row_id_in_expert = dst_row_id2src_row_id_map[
|
dst_row_id2src_row_id_in_expert = dst_row_id2src_row_id_map[
|
||||||
first_token_offset:first_token_offset +
|
first_token_offset:first_token_offset + n_token_in_expert]
|
||||||
n_token_in_expert] % n_token
|
|
||||||
# store token in current expert with align_first_token_offset
|
# store token in current expert with align_first_token_offset
|
||||||
permuted_hidden_states[align_first_token_offset:\
|
permuted_hidden_states[align_first_token_offset:\
|
||||||
align_first_token_offset+n_token_in_expert,\
|
align_first_token_offset+n_token_in_expert,\
|
||||||
...] = hidden_states[\
|
...] = hidden_states[\
|
||||||
dst_row_id2src_row_id_in_expert, ...]
|
dst_row_id2src_row_id_in_expert // topk,\
|
||||||
|
...]
|
||||||
|
permuted_idx[align_first_token_offset:\
|
||||||
|
align_first_token_offset+\
|
||||||
|
n_token_in_expert] = dst_row_id2src_row_id_in_expert
|
||||||
# set current expert m_indices
|
# set current expert m_indices
|
||||||
m_indices[align_first_token_offset:align_last_token_offset] = i - 1
|
m_indices[align_first_token_offset:align_last_token_offset] = i - 1
|
||||||
valid_row_idx += [
|
valid_row_idx += [
|
||||||
@ -135,7 +151,7 @@ def torch_permute(hidden_states: torch.Tensor,
|
|||||||
src2dst_idx].reshape((n_token, topk))
|
src2dst_idx].reshape((n_token, topk))
|
||||||
return [
|
return [
|
||||||
permuted_hidden_states, align_expert_first_token_offset,
|
permuted_hidden_states, align_expert_first_token_offset,
|
||||||
align_src_row_id2dst_row_id, m_indices, valid_row_idx
|
align_src_row_id2dst_row_id, permuted_idx, m_indices, valid_row_idx
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
@ -146,15 +162,18 @@ def torch_unpermute(permuted_hidden_states: torch.Tensor,
|
|||||||
valid_row_idx: torch.Tensor, topk: int,
|
valid_row_idx: torch.Tensor, topk: int,
|
||||||
n_expert: int) -> torch.Tensor:
|
n_expert: int) -> torch.Tensor:
|
||||||
# ignore invalid row
|
# ignore invalid row
|
||||||
|
n_hidden = permuted_hidden_states.shape[1]
|
||||||
mask = torch.zeros(permuted_hidden_states.shape[0],
|
mask = torch.zeros(permuted_hidden_states.shape[0],
|
||||||
dtype=bool,
|
dtype=bool,
|
||||||
device="cuda")
|
device="cuda")
|
||||||
mask[valid_row_idx] = True
|
mask[valid_row_idx] = True
|
||||||
permuted_hidden_states[~mask] = 0
|
permuted_hidden_states[~mask] = 0
|
||||||
idx = src_row_id2dst_row_id_map.flatten()[
|
|
||||||
token_expert_indices.flatten()].reshape(token_expert_indices.shape)
|
permuted_hidden_states = permuted_hidden_states[
|
||||||
output = permuted_hidden_states[idx, ...] * topk_weights[..., None]
|
src_row_id2dst_row_id_map.flatten(), ...]
|
||||||
output = output.sum(dim=1).to(permuted_hidden_states.dtype)
|
permuted_hidden_states = permuted_hidden_states.view(-1, topk, n_hidden)
|
||||||
|
output = (permuted_hidden_states * topk_weights.unsqueeze(2)).sum(1).to(
|
||||||
|
permuted_hidden_states.dtype)
|
||||||
return output
|
return output
|
||||||
|
|
||||||
|
|
||||||
@ -184,43 +203,56 @@ def test_moe_permute_unpermute(n_token: int, n_hidden: int, topk: int,
|
|||||||
gating_output = torch.randn((n_token, n_expert), device="cuda").to(dtype)
|
gating_output = torch.randn((n_token, n_expert), device="cuda").to(dtype)
|
||||||
topk_weights, topk_ids, token_expert_indices = fused_topk(
|
topk_weights, topk_ids, token_expert_indices = fused_topk(
|
||||||
hidden_states, gating_output, topk, False)
|
hidden_states, gating_output, topk, False)
|
||||||
gold0, gold1, gold2, gold3, valid_row_idx = torch_permute(
|
(gold_permuted_hidden_states, gold_expert_first_token_offset,
|
||||||
hidden_states,
|
gold_inv_permuted_idx, gold_permuted_idx, gold_m_indices,
|
||||||
topk_ids,
|
valid_row_idx) = torch_permute(
|
||||||
token_expert_indices,
|
hidden_states,
|
||||||
topk,
|
topk_ids,
|
||||||
n_expert,
|
# token_expert_indices,
|
||||||
n_local_expert,
|
topk,
|
||||||
start_expert,
|
n_expert,
|
||||||
expert_map=expert_map,
|
n_local_expert,
|
||||||
align_block_size=align_block_size,
|
start_expert,
|
||||||
fill_invalid_expert=fill_invalid_expert)
|
expert_map=expert_map,
|
||||||
|
align_block_size=align_block_size,
|
||||||
|
fill_invalid_expert=fill_invalid_expert)
|
||||||
|
|
||||||
result0, result1, result2, result3 = moe_permute(
|
(permuted_hidden_states, _, expert_first_token_offset, inv_permuted_idx,
|
||||||
hidden_states, topk_weights, topk_ids, token_expert_indices, topk,
|
m_indices) = moe_permute(hidden_states=hidden_states,
|
||||||
n_expert, n_local_expert, expert_map, align_block_size,
|
a1q_scale=None,
|
||||||
fill_invalid_expert)
|
topk_ids=topk_ids,
|
||||||
|
n_expert=n_expert,
|
||||||
|
n_local_expert=n_local_expert,
|
||||||
|
expert_map=expert_map,
|
||||||
|
align_block_size=align_block_size,
|
||||||
|
fill_invalid_expert=fill_invalid_expert)
|
||||||
|
|
||||||
# check expert_first_token_offset
|
# check expert_first_token_offset
|
||||||
torch.testing.assert_close(gold1, result1, atol=0, rtol=0)
|
torch.testing.assert_close(gold_expert_first_token_offset,
|
||||||
# check src_row_id2dst_row_id_map
|
expert_first_token_offset,
|
||||||
torch.testing.assert_close(gold2, result2, atol=0, rtol=0)
|
atol=0,
|
||||||
# check mindice
|
rtol=0)
|
||||||
torch.testing.assert_close(gold3, result3, atol=0, rtol=0)
|
# check src_row_id2dst_row_id_map
|
||||||
# check permuted_hidden_states, only valid token
|
torch.testing.assert_close(gold_inv_permuted_idx.flatten(),
|
||||||
torch.testing.assert_close(gold0[valid_row_idx],
|
inv_permuted_idx,
|
||||||
result0[valid_row_idx],
|
atol=0,
|
||||||
|
rtol=0)
|
||||||
|
# check mindice
|
||||||
|
torch.testing.assert_close(gold_m_indices, m_indices, atol=0, rtol=0)
|
||||||
|
# check permuted_hidden_states, only valid token
|
||||||
|
torch.testing.assert_close(gold_permuted_hidden_states[valid_row_idx],
|
||||||
|
permuted_hidden_states[valid_row_idx],
|
||||||
atol=0,
|
atol=0,
|
||||||
rtol=0)
|
rtol=0)
|
||||||
|
|
||||||
# add a random tensor to simulate group gemm
|
# add a random tensor to simulate group gemm
|
||||||
result0 = 0.5 * result0 + torch.randn_like(result0)
|
result0 = 0.5 * permuted_hidden_states + torch.randn_like(
|
||||||
|
permuted_hidden_states)
|
||||||
|
result4 = torch.empty_like(hidden_states)
|
||||||
|
moe_unpermute(result4, result0, topk_weights, inv_permuted_idx,
|
||||||
|
expert_first_token_offset)
|
||||||
|
|
||||||
result4 = moe_unpermute(result0, topk_weights, topk_ids, result2, result1,
|
|
||||||
topk, n_expert, n_local_expert)
|
|
||||||
gold4 = torch_unpermute(result0, topk_weights, topk_ids,
|
gold4 = torch_unpermute(result0, topk_weights, topk_ids,
|
||||||
token_expert_indices, result2, valid_row_idx, topk,
|
token_expert_indices, inv_permuted_idx,
|
||||||
n_local_expert)
|
valid_row_idx, topk, n_local_expert)
|
||||||
|
|
||||||
# check unpermuted hidden
|
# check unpermuted hidden
|
||||||
torch.testing.assert_close(result4, gold4, atol=2e-2, rtol=0)
|
torch.testing.assert_close(result4, gold4, atol=2e-2, rtol=0)
|
||||||
|
|||||||
@ -22,10 +22,12 @@ REVISION_ROBERTA = os.environ.get("REVISION", "main")
|
|||||||
|
|
||||||
@pytest.mark.skipif(current_platform.is_rocm(),
|
@pytest.mark.skipif(current_platform.is_rocm(),
|
||||||
reason="Xformers backend is not supported on ROCm.")
|
reason="Xformers backend is not supported on ROCm.")
|
||||||
def test_model_loading_with_params(vllm_runner):
|
def test_model_loading_with_params(vllm_runner, monkeypatch):
|
||||||
"""
|
"""
|
||||||
Test parameter weight loading with tp>1.
|
Test parameter weight loading with tp>1.
|
||||||
"""
|
"""
|
||||||
|
# to use apply_model
|
||||||
|
monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
|
||||||
with vllm_runner(model_name=MODEL_NAME,
|
with vllm_runner(model_name=MODEL_NAME,
|
||||||
revision=REVISION,
|
revision=REVISION,
|
||||||
dtype="float16",
|
dtype="float16",
|
||||||
@ -61,10 +63,12 @@ def test_model_loading_with_params(vllm_runner):
|
|||||||
|
|
||||||
@pytest.mark.skipif(current_platform.is_rocm(),
|
@pytest.mark.skipif(current_platform.is_rocm(),
|
||||||
reason="Xformers backend is not supported on ROCm.")
|
reason="Xformers backend is not supported on ROCm.")
|
||||||
def test_roberta_model_loading_with_params(vllm_runner):
|
def test_roberta_model_loading_with_params(vllm_runner, monkeypatch):
|
||||||
"""
|
"""
|
||||||
Test parameter weight loading with tp>1.
|
Test parameter weight loading with tp>1.
|
||||||
"""
|
"""
|
||||||
|
# to use apply_model
|
||||||
|
monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
|
||||||
with vllm_runner(model_name=MODEL_NAME_ROBERTA,
|
with vllm_runner(model_name=MODEL_NAME_ROBERTA,
|
||||||
revision=REVISION_ROBERTA,
|
revision=REVISION_ROBERTA,
|
||||||
dtype="float16",
|
dtype="float16",
|
||||||
@ -101,10 +105,12 @@ def test_roberta_model_loading_with_params(vllm_runner):
|
|||||||
|
|
||||||
@pytest.mark.skipif(current_platform.is_rocm(),
|
@pytest.mark.skipif(current_platform.is_rocm(),
|
||||||
reason="Xformers backend is not supported on ROCm.")
|
reason="Xformers backend is not supported on ROCm.")
|
||||||
def test_facebook_roberta_model_loading_with_params(vllm_runner):
|
def test_facebook_roberta_model_loading_with_params(vllm_runner, monkeypatch):
|
||||||
"""
|
"""
|
||||||
Test loading roberta-base model with no lm_head.
|
Test loading roberta-base model with no lm_head.
|
||||||
"""
|
"""
|
||||||
|
# to use apply_model
|
||||||
|
monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
|
||||||
model_name = "FacebookAI/roberta-base"
|
model_name = "FacebookAI/roberta-base"
|
||||||
with vllm_runner(model_name=model_name,
|
with vllm_runner(model_name=model_name,
|
||||||
dtype="float16",
|
dtype="float16",
|
||||||
|
|||||||
@ -39,17 +39,9 @@ def v1(run_with_both_engines):
|
|||||||
pytest.param("ssmits/Qwen2-7B-Instruct-embed-base",
|
pytest.param("ssmits/Qwen2-7B-Instruct-embed-base",
|
||||||
marks=[pytest.mark.skip_v0, pytest.mark.cpu_model]),
|
marks=[pytest.mark.skip_v0, pytest.mark.cpu_model]),
|
||||||
# [Encoder-only]
|
# [Encoder-only]
|
||||||
pytest.param(
|
pytest.param("BAAI/bge-base-en-v1.5", marks=[pytest.mark.core_model]),
|
||||||
"BAAI/bge-base-en-v1.5",
|
pytest.param("sentence-transformers/all-MiniLM-L12-v2"),
|
||||||
marks=[
|
pytest.param("intfloat/multilingual-e5-small"),
|
||||||
# CPU only supports V1
|
|
||||||
pytest.mark.core_model,
|
|
||||||
pytest.mark.skip_v1
|
|
||||||
]),
|
|
||||||
pytest.param("sentence-transformers/all-MiniLM-L12-v2",
|
|
||||||
marks=[pytest.mark.skip_v1]),
|
|
||||||
pytest.param("intfloat/multilingual-e5-small",
|
|
||||||
marks=[pytest.mark.skip_v1]),
|
|
||||||
pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct",
|
pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct",
|
||||||
marks=[pytest.mark.skip_v1]),
|
marks=[pytest.mark.skip_v1]),
|
||||||
# [Cross-Encoder]
|
# [Cross-Encoder]
|
||||||
|
|||||||
@ -23,6 +23,14 @@ RERANK_MODELS = [
|
|||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
    """Autouse wrapper that pulls in ``run_with_both_engines`` for each test.

    Because the fixture is ``autouse``, every test in this module depends on
    ``run_with_both_engines`` without naming it. The same wrapper could be
    moved up to ``conftest.py`` to apply to every test in a package.
    """
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("model_info", EMBEDDING_MODELS)
|
@pytest.mark.parametrize("model_info", EMBEDDING_MODELS)
|
||||||
def test_embed_models_mteb(hf_runner, vllm_runner,
|
def test_embed_models_mteb(hf_runner, vllm_runner,
|
||||||
model_info: EmbedModelInfo) -> None:
|
model_info: EmbedModelInfo) -> None:
|
||||||
|
|||||||
@ -677,6 +677,7 @@ VLM_TEST_SETTINGS = {
|
|||||||
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
||||||
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
|
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
|
||||||
video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
|
video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
|
||||||
|
multi_image_prompt="Picture 1: <vlm_image>\nPicture 2: <vlm_image>\nDescribe these two images with one paragraph respectively.", # noqa: E501
|
||||||
max_model_len=4096,
|
max_model_len=4096,
|
||||||
max_num_seqs=2,
|
max_num_seqs=2,
|
||||||
auto_cls=AutoModelForVision2Seq,
|
auto_cls=AutoModelForVision2Seq,
|
||||||
|
|||||||
@ -22,6 +22,9 @@ from transformers import (AutoConfig, AutoProcessor, AutoTokenizer,
|
|||||||
GenerationConfig)
|
GenerationConfig)
|
||||||
|
|
||||||
from vllm import LLM, SamplingParams
|
from vllm import LLM, SamplingParams
|
||||||
|
from vllm.v1.executor.abstract import Executor
|
||||||
|
from vllm.v1.kv_cache_interface import (ChunkedLocalAttentionSpec,
|
||||||
|
FullAttentionSpec)
|
||||||
|
|
||||||
from ....utils import multi_gpu_test
|
from ....utils import multi_gpu_test
|
||||||
|
|
||||||
@ -69,6 +72,26 @@ def run_maverick_serving(model: str):
|
|||||||
raise
|
raise
|
||||||
|
|
||||||
|
|
||||||
|
def get_rope_layers_config(model_path: str) -> list[int]:
    """
    Get the interleaved RoPE configuration from the HuggingFace config.

    Args:
        model_path: Path to the local directory containing the reduced
            Maverick model checkpoint. The directory must contain a
            ``config.json`` file.

    Returns:
        The ``text_config["no_rope_layers"]`` list read from ``config.json``.
        Each entry is 0 or 1: 0 indicates that the layer does not use RoPE,
        1 indicates that the layer uses RoPE (and local attention).

    Raises:
        FileNotFoundError: If ``config.json`` does not exist in ``model_path``.
        KeyError: If ``text_config`` or ``no_rope_layers`` is missing.
    """
    config_path = Path(model_path) / "config.json"
    # JSON files are UTF-8; decode explicitly instead of relying on the
    # platform's locale encoding.
    model_config = json.loads(config_path.read_text(encoding="utf-8"))
    no_rope_layers = model_config["text_config"]["no_rope_layers"]
    print(f"Found no_rope_layers: {no_rope_layers}")
    return no_rope_layers
|
||||||
|
|
||||||
|
|
||||||
def create_reduced_maverick_model(
|
def create_reduced_maverick_model(
|
||||||
original_model_name:
|
original_model_name:
|
||||||
str = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
str = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
||||||
@ -113,7 +136,6 @@ def create_reduced_maverick_model(
|
|||||||
print("Loading original model configuration...")
|
print("Loading original model configuration...")
|
||||||
original_config = AutoConfig.from_pretrained(original_model_name,
|
original_config = AutoConfig.from_pretrained(original_model_name,
|
||||||
trust_remote_code=True)
|
trust_remote_code=True)
|
||||||
|
|
||||||
print("Creating reduced configuration...")
|
print("Creating reduced configuration...")
|
||||||
reduced_config = create_reduced_config(original_config, text_layers,
|
reduced_config = create_reduced_config(original_config, text_layers,
|
||||||
num_experts, vision_layers)
|
num_experts, vision_layers)
|
||||||
@ -510,21 +532,32 @@ def save_weights_to_safetensors(weights: dict[str, torch.Tensor],
|
|||||||
f"{index_data['metadata']['total_size'] / (1024**3):.2f} GB")
|
f"{index_data['metadata']['total_size'] / (1024**3):.2f} GB")
|
||||||
|
|
||||||
|
|
||||||
def run_reduced_model(model_path: str,
|
def check_attention_spec_interleaved_rope(
|
||||||
should_profile: bool = False,
|
llm: LLM,
|
||||||
**kwargs) -> None:
|
num_attention_layers: int,
|
||||||
"""Test the created reduced model with vLLM."""
|
num_ranks: int,
|
||||||
|
rope_layers: list[int],
|
||||||
print(f"\nTesting reduced model at {model_path}...")
|
):
|
||||||
|
"""Check that the attention spec is correct."""
|
||||||
llm = LLM(
|
assert isinstance(llm.llm_engine.model_executor, Executor)
|
||||||
model=model_path,
|
kv_cache_specs_per_rank = llm.llm_engine.model_executor.get_kv_cache_specs(
|
||||||
trust_remote_code=True,
|
|
||||||
max_model_len=512, # Small context for testing
|
|
||||||
gpu_memory_utilization=0.3, # Conservative memory usage
|
|
||||||
**kwargs,
|
|
||||||
)
|
)
|
||||||
|
for rank in range(num_ranks):
|
||||||
|
kv_cache_specs = kv_cache_specs_per_rank[rank]
|
||||||
|
assert len(kv_cache_specs.keys()) == num_attention_layers
|
||||||
|
for i in range(num_attention_layers):
|
||||||
|
if rope_layers[i] == 0:
|
||||||
|
expected_spec = FullAttentionSpec
|
||||||
|
else:
|
||||||
|
expected_spec = ChunkedLocalAttentionSpec
|
||||||
|
assert isinstance(
|
||||||
|
kv_cache_specs[
|
||||||
|
f"language_model.model.layers.{i}.self_attn.attn"],
|
||||||
|
expected_spec)
|
||||||
|
|
||||||
|
|
||||||
|
def run_reduced_model(llm: LLM, should_profile: bool = False) -> None:
|
||||||
|
"""Test the created reduced model with vLLM."""
|
||||||
sampling_params = SamplingParams(temperature=0.8,
|
sampling_params = SamplingParams(temperature=0.8,
|
||||||
top_p=0.95,
|
top_p=0.95,
|
||||||
max_tokens=50)
|
max_tokens=50)
|
||||||
@ -551,6 +584,7 @@ def run_reduced_model(model_path: str,
|
|||||||
@pytest.mark.parametrize("tp,ep", [(2, True)])
|
@pytest.mark.parametrize("tp,ep", [(2, True)])
|
||||||
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
|
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
|
||||||
def test_dummy_maverick(
|
def test_dummy_maverick(
|
||||||
|
monkeypatch,
|
||||||
original_model_name: str,
|
original_model_name: str,
|
||||||
text_layers: int,
|
text_layers: int,
|
||||||
num_experts: int,
|
num_experts: int,
|
||||||
@ -562,6 +596,10 @@ def test_dummy_maverick(
|
|||||||
force_recreate: bool = True,
|
force_recreate: bool = True,
|
||||||
profile: bool = False,
|
profile: bool = False,
|
||||||
) -> None:
|
) -> None:
|
||||||
|
# Disable multiprocessing allows us to access model executor from LLM engine
|
||||||
|
monkeypatch.setenv("VLLM_USE_V1", "1")
|
||||||
|
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
|
||||||
|
|
||||||
model_path = create_reduced_maverick_model(
|
model_path = create_reduced_maverick_model(
|
||||||
original_model_name=original_model_name,
|
original_model_name=original_model_name,
|
||||||
output_dir=output_dir,
|
output_dir=output_dir,
|
||||||
@ -573,11 +611,27 @@ def test_dummy_maverick(
|
|||||||
|
|
||||||
print(f"\nReduced model created successfully at: {model_path}")
|
print(f"\nReduced model created successfully at: {model_path}")
|
||||||
|
|
||||||
run_reduced_model(model_path=model_path,
|
rope_layers = get_rope_layers_config(model_path)
|
||||||
should_profile=profile,
|
|
||||||
enforce_eager=enforce_eager,
|
llm = LLM(
|
||||||
tensor_parallel_size=tp,
|
model=model_path,
|
||||||
enable_expert_parallel=ep)
|
trust_remote_code=True,
|
||||||
|
max_model_len=512, # Small context for testing
|
||||||
|
gpu_memory_utilization=0.3, # Conservative memory usage
|
||||||
|
enforce_eager=enforce_eager,
|
||||||
|
tensor_parallel_size=tp,
|
||||||
|
enable_expert_parallel=ep,
|
||||||
|
)
|
||||||
|
|
||||||
|
check_attention_spec_interleaved_rope(
|
||||||
|
llm,
|
||||||
|
text_layers,
|
||||||
|
tp,
|
||||||
|
rope_layers,
|
||||||
|
)
|
||||||
|
|
||||||
|
print(f"\nTesting reduced model at {model_path}...")
|
||||||
|
run_reduced_model(llm=llm, should_profile=profile)
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
|||||||
252
tests/models/multimodal/generation/test_phi4_multimodal.py
Normal file
252
tests/models/multimodal/generation/test_phi4_multimodal.py
Normal file
@ -0,0 +1,252 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
|
import os
|
||||||
|
from collections.abc import Sequence
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
import librosa
|
||||||
|
import pytest
|
||||||
|
from huggingface_hub import snapshot_download
|
||||||
|
|
||||||
|
from vllm.assets.image import ImageAsset
|
||||||
|
from vllm.lora.request import LoRARequest
|
||||||
|
from vllm.multimodal.image import rescale_image_size
|
||||||
|
from vllm.platforms import current_platform
|
||||||
|
|
||||||
|
from ....conftest import (IMAGE_ASSETS, HfRunner, PromptAudioInput,
|
||||||
|
PromptImageInput, VllmRunner)
|
||||||
|
from ....utils import large_gpu_test
|
||||||
|
from ...utils import check_logprobs_close
|
||||||
|
|
||||||
|
# Single-image prompts, keyed by image asset name.
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
    "stop_sign":
    "<|user|>\n<|image|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n",  # noqa: E501
    "cherry_blossom":
    "<|user|>\n<|image|>\nPlease infer the season with reason in details.<|end|>\n<|assistant|>\n",  # noqa: E501
})
# Prompt containing two image placeholders.
HF_MULTIIMAGE_IMAGE_PROMPT = "<|user|>\n<|image|>\n<|image|>\nDescribe these images.<|end|>\n<|assistant|>\n"  # noqa: E501

# NOTE: this is a module-level call, so it runs when the module is imported.
model_path = snapshot_download(
    "microsoft/Phi-4-multimodal-instruct",
    revision="refs/pr/70",
)
# The vision-lora and speech-lora weights live alongside the base model in
# the same snapshot, so the LoRA path has to be spelled out manually.
vision_lora_path = os.path.join(model_path, "vision-lora")
speech_question = os.path.join(
    model_path,
    "examples",
    "what_is_shown_in_this_image.wav",
)
models = [model_path]

target_dtype = "half"

# ROCm Triton FA can run into shared memory issues with these models,
# so use other backends in the meantime.
# FIXME (mattwong, gshtrasb, hongxiayan)
if current_platform.is_rocm():
    os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"
|
||||||
|
|
||||||
|
|
||||||
|
def run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    inputs: Sequence[tuple[list[str], PromptImageInput,
                           Optional[PromptAudioInput]]],
    model: str,
    *,
    max_model_len: int,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    mm_limit: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
):
    """Check that vLLM and HuggingFace produce close logprobs.

    Each element of ``inputs`` is one case: a ``(prompts, images, audios)``
    triple. Every case is generated greedily with vLLM (with the "vision"
    LoRA attached) and then with HF (with the same adapter loaded), and the
    per-case outputs are compared with ``check_logprobs_close``.

    Args:
        hf_runner: Context-manager factory for the HuggingFace model.
        vllm_runner: Context-manager factory for the vLLM model.
        inputs: Cases to run, each a ``(prompts, images, audios)`` triple.
        model: Model name or path given to both runners.
        max_model_len: Forwarded to the vLLM runner; should be greater than
            the image feature size.
        dtype: Dtype string given to both runners.
        max_tokens: Maximum number of tokens generated per prompt.
        num_logprobs: Number of logprobs requested per generated token.
        mm_limit: Per-prompt limit on the number of images for vLLM.
        tensor_parallel_size: Forwarded to the vLLM runner.
        distributed_executor_backend: Forwarded to the vLLM runner.
    """
    # NOTE: the order matters -- run vLLM first, then HF.
    # vLLM needs a fresh process without CUDA initialization. Running HF
    # first would initialize CUDA, which hurts the multiprocessing backend
    # with the fork method (the default method).
    engine_kwargs = dict(
        task="generate",
        max_model_len=max_model_len,
        max_num_seqs=2,
        dtype=dtype,
        limit_mm_per_prompt={"image": mm_limit},
        tensor_parallel_size=tensor_parallel_size,
        distributed_executor_backend=distributed_executor_backend,
        enable_lora=True,
        max_lora_rank=320,
        gpu_memory_utilization=0.8,  # set to 0.8 to avoid OOM in CI
        enforce_eager=True,
        trust_remote_code=False,
    )
    with vllm_runner(model, **engine_kwargs) as vllm_model:
        vision_lora = LoRARequest("vision", 1, vision_lora_path)
        vllm_results = []
        for prompts, images, audios in inputs:
            vllm_results.append(
                vllm_model.generate_greedy_logprobs(
                    prompts,
                    max_tokens,
                    num_logprobs=num_logprobs,
                    images=images,
                    audios=audios,
                    lora_request=vision_lora,
                ))

    with hf_runner(model, dtype=dtype) as hf_model:
        hf_model.model.load_adapter(vision_lora_path, adapter_name="vision")
        eos_token_id = hf_model.processor.tokenizer.eos_token_id
        hf_results = []
        for prompts, images, audios in inputs:
            hf_results.append(
                hf_model.generate_greedy_logprobs_limit(
                    prompts,
                    max_tokens,
                    num_logprobs=num_logprobs,
                    images=images,
                    audios=audios,
                    eos_token_id=eos_token_id,
                ))

    # Compare case by case; zip pairs outputs built from the same inputs.
    for hf_case, vllm_case in zip(hf_results, vllm_results):
        check_logprobs_close(
            outputs_0_lst=hf_case,
            outputs_1_lst=vllm_case,
            name_0="hf",
            name_1="vllm",
        )
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "size_factors",
    [
        # No image
        [],
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.25, 0.5, 1.0],
    ],
)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_model_len", [12800])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
                dtype: str, max_model_len: int, max_tokens: int,
                num_logprobs: int) -> None:
    """Compare HF and vLLM on single-image prompts at several image scales.

    One case is built per (image, prompt) pair; within a case the prompt is
    repeated once per size factor and the image is rescaled by each factor.
    """
    images = [asset.pil_image for asset in image_assets]

    inputs_per_image = []
    for image, prompt in zip(images, HF_IMAGE_PROMPTS):
        repeated_prompts = [prompt] * len(size_factors)
        scaled_images = [
            rescale_image_size(image, factor) for factor in size_factors
        ]
        # No audio input for the image-only cases.
        inputs_per_image.append((repeated_prompts, scaled_images, None))

    run_test(
        hf_runner,
        vllm_runner,
        inputs_per_image,
        model,
        dtype=dtype,
        max_model_len=max_model_len,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        mm_limit=1,
        tensor_parallel_size=1,
    )
|
||||||
|
|
||||||
|
|
||||||
|
@large_gpu_test(min_gb=48)
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "size_factors",
    [
        # No image
        # [],
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.25, 0.5, 1.0],
    ],
)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_model_len", [25600])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
                             size_factors, dtype: str, max_model_len: int,
                             max_tokens: int, num_logprobs: int) -> None:
    """Compare HF and vLLM on prompts that carry several images at once.

    A single case is built: the multi-image prompt is repeated once per size
    factor, and each prompt receives all asset images rescaled by that factor.
    """
    images = [asset.pil_image for asset in image_assets]

    repeated_prompts = [HF_MULTIIMAGE_IMAGE_PROMPT] * len(size_factors)
    images_per_prompt = []
    for factor in size_factors:
        images_per_prompt.append(
            [rescale_image_size(image, factor) for image in images])

    # No audio input for the image-only case.
    inputs_per_case = [(repeated_prompts, images_per_prompt, None)]

    run_test(
        hf_runner,
        vllm_runner,
        inputs_per_case,
        model,
        dtype=dtype,
        max_model_len=max_model_len,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        mm_limit=2,
        tensor_parallel_size=1,
    )
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_model_len", [12800])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_vision_speech_models(hf_runner, vllm_runner, model, dtype: str,
                              max_model_len: int, max_tokens: int,
                              num_logprobs: int) -> None:
    """Compare HF and vLLM on a prompt with one image and one audio clip."""
    # The question is asked through the example speech clip, so that the
    # model outputs are reasonable.
    audio = librosa.load(speech_question, sr=16000)
    image = ImageAsset("cherry_blossom").pil_image.convert("RGB")

    prompt = "<|user|><|image|><|audio|><|end|><|assistant|>"
    inputs_vision_speech = [([prompt], [image], [audio])]

    run_test(
        hf_runner,
        vllm_runner,
        inputs_vision_speech,
        model,
        dtype=dtype,
        max_model_len=max_model_len,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        mm_limit=1,
        tensor_parallel_size=1,
    )
|
||||||
@ -41,12 +41,18 @@ def glm4_1v_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict:
|
|||||||
|
|
||||||
|
|
||||||
def _test_processing_correctness(
|
def _test_processing_correctness(
|
||||||
model_id: str,
|
model_id_or_arch: str,
|
||||||
hit_rate: float,
|
hit_rate: float,
|
||||||
num_batches: int,
|
num_batches: int,
|
||||||
simplify_rate: float,
|
simplify_rate: float,
|
||||||
):
|
):
|
||||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
|
if model_id_or_arch in HF_EXAMPLE_MODELS.get_supported_archs():
|
||||||
|
# Use model architecture to get the default model id
|
||||||
|
model_info = HF_EXAMPLE_MODELS.get_hf_info(model_id_or_arch)
|
||||||
|
model_id = model_info.default
|
||||||
|
else:
|
||||||
|
model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id_or_arch)
|
||||||
|
model_id = model_id_or_arch
|
||||||
model_info.check_available_online(on_fail="skip")
|
model_info.check_available_online(on_fail="skip")
|
||||||
model_info.check_transformers_version(on_fail="skip")
|
model_info.check_transformers_version(on_fail="skip")
|
||||||
|
|
||||||
@ -58,7 +64,7 @@ def _test_processing_correctness(
|
|||||||
trust_remote_code=model_info.trust_remote_code,
|
trust_remote_code=model_info.trust_remote_code,
|
||||||
seed=0,
|
seed=0,
|
||||||
dtype="auto",
|
dtype="auto",
|
||||||
revision=None,
|
revision=model_info.revision,
|
||||||
hf_overrides=model_info.hf_overrides,
|
hf_overrides=model_info.hf_overrides,
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -272,12 +278,14 @@ def _test_processing_correctness_one(
|
|||||||
"THUDM/GLM-4.1V-9B-Thinking",
|
"THUDM/GLM-4.1V-9B-Thinking",
|
||||||
"ibm-granite/granite-speech-3.3-2b",
|
"ibm-granite/granite-speech-3.3-2b",
|
||||||
"h2oai/h2ovl-mississippi-800m",
|
"h2oai/h2ovl-mississippi-800m",
|
||||||
|
"internlm/Intern-S1",
|
||||||
"OpenGVLab/InternVL2-1B",
|
"OpenGVLab/InternVL2-1B",
|
||||||
"OpenGVLab/InternVL3-1B",
|
"OpenGVLab/InternVL3-1B",
|
||||||
"HuggingFaceM4/Idefics3-8B-Llama3",
|
"HuggingFaceM4/Idefics3-8B-Llama3",
|
||||||
"HuggingFaceTB/SmolVLM2-2.2B-Instruct",
|
"HuggingFaceTB/SmolVLM2-2.2B-Instruct",
|
||||||
"moonshotai/Kimi-VL-A3B-Instruct",
|
"moonshotai/Kimi-VL-A3B-Instruct",
|
||||||
"meta-llama/Llama-4-Scout-17B-16E-Instruct",
|
"meta-llama/Llama-4-Scout-17B-16E-Instruct",
|
||||||
|
"naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B",
|
||||||
"llava-hf/llava-1.5-7b-hf",
|
"llava-hf/llava-1.5-7b-hf",
|
||||||
"llava-hf/llava-v1.6-mistral-7b-hf",
|
"llava-hf/llava-v1.6-mistral-7b-hf",
|
||||||
"llava-hf/LLaVA-NeXT-Video-7B-hf",
|
"llava-hf/LLaVA-NeXT-Video-7B-hf",
|
||||||
@ -330,6 +338,28 @@ def test_processing_correctness(
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# Phi4MultimodalForCausalLM shares its model repo with the original-format
# Phi4MMForCausalLM, so it is added as a separate test case.
# Remove this test once the conversion PR is merged:
# https://huggingface.co/microsoft/Phi-4-multimodal-instruct/discussions/70
@pytest.mark.parametrize("model_arch", ["Phi4MultimodalForCausalLM"])
@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
@pytest.mark.parametrize("num_batches", [32])
@pytest.mark.parametrize("simplify_rate", [1.0])
def test_processing_correctness_phi4_multimodal(
    model_arch: str,
    hit_rate: float,
    num_batches: int,
    simplify_rate: float,
):
    """Run the shared processing-correctness check for an architecture name.

    The architecture name, not a model id, is passed as the first argument
    of ``_test_processing_correctness``.
    """
    check_kwargs = dict(
        hit_rate=hit_rate,
        num_batches=num_batches,
        simplify_rate=simplify_rate,
    )
    _test_processing_correctness(model_arch, **check_kwargs)
|
||||||
|
|
||||||
|
|
||||||
def _assert_inputs_equal(
|
def _assert_inputs_equal(
|
||||||
a: MultiModalInputs,
|
a: MultiModalInputs,
|
||||||
b: MultiModalInputs,
|
b: MultiModalInputs,
|
||||||
|
|||||||
@ -201,6 +201,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
|
|||||||
trust_remote_code=True),
|
trust_remote_code=True),
|
||||||
"HunYuanDenseV1ForCausalLM":_HfExamplesInfo("tencent/Hunyuan-7B-Instruct-0124",
|
"HunYuanDenseV1ForCausalLM":_HfExamplesInfo("tencent/Hunyuan-7B-Instruct-0124",
|
||||||
trust_remote_code=True),
|
trust_remote_code=True),
|
||||||
|
"HCXVisionForCausalLM": _HfExamplesInfo(
|
||||||
|
"naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B",
|
||||||
|
trust_remote_code=True),
|
||||||
"InternLMForCausalLM": _HfExamplesInfo("internlm/internlm-chat-7b",
|
"InternLMForCausalLM": _HfExamplesInfo("internlm/internlm-chat-7b",
|
||||||
trust_remote_code=True),
|
trust_remote_code=True),
|
||||||
"InternLM2ForCausalLM": _HfExamplesInfo("internlm/internlm2-chat-7b",
|
"InternLM2ForCausalLM": _HfExamplesInfo("internlm/internlm2-chat-7b",
|
||||||
@ -218,6 +221,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
|
|||||||
"fp8": "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8"}), # noqa: E501
|
"fp8": "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8"}), # noqa: E501
|
||||||
"LLaMAForCausalLM": _HfExamplesInfo("decapoda-research/llama-7b-hf",
|
"LLaMAForCausalLM": _HfExamplesInfo("decapoda-research/llama-7b-hf",
|
||||||
is_available_online=False),
|
is_available_online=False),
|
||||||
|
"Llama4ForCausalLM": _HfExamplesInfo("meta-llama/Llama-4-Scout-17B-16E-Instruct", # noqa: E501
|
||||||
|
is_available_online=False),
|
||||||
"MambaForCausalLM": _HfExamplesInfo("state-spaces/mamba-130m-hf"),
|
"MambaForCausalLM": _HfExamplesInfo("state-spaces/mamba-130m-hf"),
|
||||||
"Mamba2ForCausalLM": _HfExamplesInfo("mistralai/Mamba-Codestral-7B-v0.1"),
|
"Mamba2ForCausalLM": _HfExamplesInfo("mistralai/Mamba-Codestral-7B-v0.1"),
|
||||||
"FalconMambaForCausalLM": _HfExamplesInfo("tiiuae/falcon-mamba-7b-instruct"), # noqa: E501
|
"FalconMambaForCausalLM": _HfExamplesInfo("tiiuae/falcon-mamba-7b-instruct"), # noqa: E501
|
||||||
@ -376,6 +381,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
|
|||||||
extras={"2B": "OpenGVLab/InternVL2-2B",
|
extras={"2B": "OpenGVLab/InternVL2-2B",
|
||||||
"3.0": "OpenGVLab/InternVL3-1B"}, # noqa: E501
|
"3.0": "OpenGVLab/InternVL3-1B"}, # noqa: E501
|
||||||
trust_remote_code=True),
|
trust_remote_code=True),
|
||||||
|
"InternS1ForConditionalGeneration": _HfExamplesInfo("internlm/Intern-S1",
|
||||||
|
trust_remote_code=True),
|
||||||
"Idefics3ForConditionalGeneration": _HfExamplesInfo("HuggingFaceM4/Idefics3-8B-Llama3", # noqa: E501
|
"Idefics3ForConditionalGeneration": _HfExamplesInfo("HuggingFaceM4/Idefics3-8B-Llama3", # noqa: E501
|
||||||
{"tiny": "HuggingFaceTB/SmolVLM-256M-Instruct"}), # noqa: E501
|
{"tiny": "HuggingFaceTB/SmolVLM-256M-Instruct"}), # noqa: E501
|
||||||
"KeyeForConditionalGeneration": _HfExamplesInfo("Kwai-Keye/Keye-VL-8B-Preview", # noqa: E501
|
"KeyeForConditionalGeneration": _HfExamplesInfo("Kwai-Keye/Keye-VL-8B-Preview", # noqa: E501
|
||||||
@ -426,6 +433,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
|
|||||||
"1.6-gemma": "AIDC-AI/Ovis1.6-Gemma2-9B"}), # noqa: E501
|
"1.6-gemma": "AIDC-AI/Ovis1.6-Gemma2-9B"}), # noqa: E501
|
||||||
"Phi4MMForCausalLM": _HfExamplesInfo("microsoft/Phi-4-multimodal-instruct",
|
"Phi4MMForCausalLM": _HfExamplesInfo("microsoft/Phi-4-multimodal-instruct",
|
||||||
trust_remote_code=True),
|
trust_remote_code=True),
|
||||||
|
"Phi4MultimodalForCausalLM": _HfExamplesInfo("microsoft/Phi-4-multimodal-instruct", # noqa: E501
|
||||||
|
revision="refs/pr/70"),
|
||||||
"PixtralForConditionalGeneration": _HfExamplesInfo("mistralai/Pixtral-12B-2409", # noqa: E501
|
"PixtralForConditionalGeneration": _HfExamplesInfo("mistralai/Pixtral-12B-2409", # noqa: E501
|
||||||
tokenizer_mode="mistral"),
|
tokenizer_mode="mistral"),
|
||||||
"QwenVLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen-VL",
|
"QwenVLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen-VL",
|
||||||
|
|||||||
@ -17,7 +17,9 @@ from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tenso
|
|||||||
CompressedTensorsW4A4Fp4, CompressedTensorsW4A16Fp4,
|
CompressedTensorsW4A4Fp4, CompressedTensorsW4A16Fp4,
|
||||||
CompressedTensorsW4A16Sparse24, CompressedTensorsW8A8Fp8,
|
CompressedTensorsW4A16Sparse24, CompressedTensorsW8A8Fp8,
|
||||||
CompressedTensorsW8A8Int8, CompressedTensorsW8A16Fp8,
|
CompressedTensorsW8A8Int8, CompressedTensorsW8A16Fp8,
|
||||||
CompressedTensorsWNA16, cutlass_fp4_supported)
|
CompressedTensorsWNA16)
|
||||||
|
from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
||||||
|
cutlass_fp4_supported)
|
||||||
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
|
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
|
||||||
sparse_cutlass_supported)
|
sparse_cutlass_supported)
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
|
|||||||
@ -8,7 +8,10 @@ import pytest
|
|||||||
|
|
||||||
from tests.quantization.utils import is_quant_method_supported
|
from tests.quantization.utils import is_quant_method_supported
|
||||||
|
|
||||||
MODELS = ["microsoft/Phi-3-mini-4k-instruct"]
|
MODELS = [
|
||||||
|
"microsoft/Phi-3-mini-4k-instruct", # dense model
|
||||||
|
"ai21labs/Jamba-tiny-dev", # MoE model
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(not is_quant_method_supported("rtn"),
|
@pytest.mark.skipif(not is_quant_method_supported("rtn"),
|
||||||
|
|||||||
@ -4,6 +4,7 @@
|
|||||||
import pytest
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
|
from vllm.model_executor.models.fuyu import FuyuImagePatchInputs
|
||||||
from vllm.model_executor.models.phi3v import Phi3VImagePixelInputs
|
from vllm.model_executor.models.phi3v import Phi3VImagePixelInputs
|
||||||
|
|
||||||
|
|
||||||
@ -124,3 +125,24 @@ def test_tensor_schema_with_invalid_resolve_binding_dims():
|
|||||||
"w": 336
|
"w": 336
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_tensor_schema_with_list_of_symbolic_dim():
    """Constructing the schema must not raise when the list length is bn."""
    num_images = 3
    rows = [torch.randn(768) for _ in range(num_images)]
    flat_data = torch.stack(rows)  # (bn=3, fn)
    patches_per_image = [64] * num_images  # len = bn = 3

    FuyuImagePatchInputs(flat_data=flat_data,
                         patches_per_image=patches_per_image)
|
||||||
|
|
||||||
|
|
||||||
|
def test_tensor_schema_with_list_of_symbolic_dim_mismatch_in_length():
    """A list shorter than the tensor's leading dim must raise ValueError."""
    rows = [torch.randn(768) for _ in range(4)]
    flat_data = torch.stack(rows)  # (bn=4, fn)
    patches_per_image = [64] * 3  # len = 3, which differs from bn

    expected_error = "expected 'bn'=4, got 3"
    with pytest.raises(ValueError, match=expected_error):
        FuyuImagePatchInputs(flat_data=flat_data,
                             patches_per_image=patches_per_image)
|
||||||
@ -93,6 +93,7 @@ def create_common_attn_metadata(
|
|||||||
max_query_len=max_query_len,
|
max_query_len=max_query_len,
|
||||||
block_table_tensor=block_table_tensor,
|
block_table_tensor=block_table_tensor,
|
||||||
slot_mapping=slot_mapping,
|
slot_mapping=slot_mapping,
|
||||||
|
causal=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -13,7 +13,6 @@ UNSUPPORTED_MODELS_V1 = [
|
|||||||
"openai/whisper-large-v3", # transcription
|
"openai/whisper-large-v3", # transcription
|
||||||
"facebook/bart-large-cnn", # encoder decoder
|
"facebook/bart-large-cnn", # encoder decoder
|
||||||
"state-spaces/mamba-130m-hf", # mamba1
|
"state-spaces/mamba-130m-hf", # mamba1
|
||||||
"BAAI/bge-m3", # embedding
|
|
||||||
]
|
]
|
||||||
|
|
||||||
MODEL = "meta-llama/Llama-3.2-1B-Instruct"
|
MODEL = "meta-llama/Llama-3.2-1B-Instruct"
|
||||||
|
|||||||
@ -1,9 +1,8 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
import re
|
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
import regex as re
|
||||||
import requests
|
import requests
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
|
|||||||
@ -59,7 +59,7 @@ def test_basic(
|
|||||||
# actually test chunked prompt
|
# actually test chunked prompt
|
||||||
max_num_batched_tokens=1024,
|
max_num_batched_tokens=1024,
|
||||||
max_model_len=8192,
|
max_model_len=8192,
|
||||||
gpu_memory_utilization=0.95,
|
gpu_memory_utilization=0.7,
|
||||||
max_num_seqs=max_num_seqs,
|
max_num_seqs=max_num_seqs,
|
||||||
tensor_parallel_size=tensor_parallel_size) as vllm_model:
|
tensor_parallel_size=tensor_parallel_size) as vllm_model:
|
||||||
vllm_outputs = vllm_model.generate_greedy(example_prompts,
|
vllm_outputs = vllm_model.generate_greedy(example_prompts,
|
||||||
|
|||||||
@ -67,4 +67,9 @@ class InfEncoder(json.JSONEncoder):
|
|||||||
|
|
||||||
def write_to_json(filename: str, records: list) -> None:
|
def write_to_json(filename: str, records: list) -> None:
|
||||||
with open(filename, "w") as f:
|
with open(filename, "w") as f:
|
||||||
json.dump(records, f, cls=InfEncoder)
|
json.dump(
|
||||||
|
records,
|
||||||
|
f,
|
||||||
|
cls=InfEncoder,
|
||||||
|
default=lambda o: f"<{type(o).__name__} is not JSON serializable>",
|
||||||
|
)
|
||||||
|
|||||||
@ -4790,26 +4790,26 @@ class VllmConfig:
|
|||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return (
|
return (
|
||||||
f"model={self.model_config.model!r},"
|
f"model={self.model_config.model!r}, "
|
||||||
f" speculative_config={self.speculative_config!r},"
|
f"speculative_config={self.speculative_config!r}, "
|
||||||
f" tokenizer={self.model_config.tokenizer!r}, "
|
f"tokenizer={self.model_config.tokenizer!r}, "
|
||||||
f"skip_tokenizer_init={self.model_config.skip_tokenizer_init},"
|
f"skip_tokenizer_init={self.model_config.skip_tokenizer_init}, "
|
||||||
f" tokenizer_mode={self.model_config.tokenizer_mode}, "
|
f"tokenizer_mode={self.model_config.tokenizer_mode}, "
|
||||||
f"revision={self.model_config.revision}, "
|
f"revision={self.model_config.revision}, "
|
||||||
f"override_neuron_config={self.model_config.override_neuron_config},"
|
f"override_neuron_config={self.model_config.override_neuron_config}, " # noqa
|
||||||
f" tokenizer_revision={self.model_config.tokenizer_revision}, "
|
f"tokenizer_revision={self.model_config.tokenizer_revision}, "
|
||||||
f"trust_remote_code={self.model_config.trust_remote_code}, "
|
f"trust_remote_code={self.model_config.trust_remote_code}, "
|
||||||
f"dtype={self.model_config.dtype}, "
|
f"dtype={self.model_config.dtype}, "
|
||||||
f"max_seq_len={self.model_config.max_model_len},"
|
f"max_seq_len={self.model_config.max_model_len}, "
|
||||||
f" download_dir={self.load_config.download_dir!r}, "
|
f"download_dir={self.load_config.download_dir!r}, "
|
||||||
f"load_format={self.load_config.load_format}, "
|
f"load_format={self.load_config.load_format}, "
|
||||||
f"tensor_parallel_size={self.parallel_config.tensor_parallel_size},"
|
f"tensor_parallel_size={self.parallel_config.tensor_parallel_size}, " # noqa
|
||||||
f" pipeline_parallel_size={self.parallel_config.pipeline_parallel_size}, " # noqa
|
f"pipeline_parallel_size={self.parallel_config.pipeline_parallel_size}, " # noqa
|
||||||
f"disable_custom_all_reduce={self.parallel_config.disable_custom_all_reduce}, " # noqa
|
f"disable_custom_all_reduce={self.parallel_config.disable_custom_all_reduce}, " # noqa
|
||||||
f"quantization={self.model_config.quantization}, "
|
f"quantization={self.model_config.quantization}, "
|
||||||
f"enforce_eager={self.model_config.enforce_eager}, "
|
f"enforce_eager={self.model_config.enforce_eager}, "
|
||||||
f"kv_cache_dtype={self.cache_config.cache_dtype}, "
|
f"kv_cache_dtype={self.cache_config.cache_dtype}, "
|
||||||
f" device_config={self.device_config.device}, "
|
f"device_config={self.device_config.device}, "
|
||||||
f"decoding_config={self.decoding_config!r}, "
|
f"decoding_config={self.decoding_config!r}, "
|
||||||
f"observability_config={self.observability_config!r}, "
|
f"observability_config={self.observability_config!r}, "
|
||||||
f"seed={self.model_config.seed}, "
|
f"seed={self.model_config.seed}, "
|
||||||
|
|||||||
@ -156,8 +156,16 @@ class SharedStorageConnector(KVConnectorBase_V1):
|
|||||||
logger.info("Inject KV cache of %d tokens to the paged memory",
|
logger.info("Inject KV cache of %d tokens to the paged memory",
|
||||||
len(request.slot_mapping))
|
len(request.slot_mapping))
|
||||||
for layer_name in forward_context.no_compile_layers:
|
for layer_name in forward_context.no_compile_layers:
|
||||||
attn_layer = forward_context.no_compile_layers[layer_name]
|
layer = forward_context.no_compile_layers[layer_name]
|
||||||
kv_cache_layer = attn_layer.kv_cache[\
|
|
||||||
|
# Only process layers that have kv_cache
|
||||||
|
# attribute (attention layers) Skip non-attention
|
||||||
|
# layers like FusedMoE/MLP etc.
|
||||||
|
kv_cache_attr = getattr(layer, 'kv_cache', None)
|
||||||
|
if kv_cache_attr is None:
|
||||||
|
continue
|
||||||
|
|
||||||
|
kv_cache_layer = kv_cache_attr[ \
|
||||||
forward_context.virtual_engine]
|
forward_context.virtual_engine]
|
||||||
|
|
||||||
filename = self._generate_filename_debug(
|
filename = self._generate_filename_debug(
|
||||||
|
|||||||
@ -1649,7 +1649,8 @@ class EngineArgs:
|
|||||||
|
|
||||||
if (self.max_num_seqs is None
|
if (self.max_num_seqs is None
|
||||||
and usage_context in default_max_num_seqs):
|
and usage_context in default_max_num_seqs):
|
||||||
self.max_num_seqs = default_max_num_seqs[usage_context]
|
self.max_num_seqs = min(default_max_num_seqs[usage_context],
|
||||||
|
self.max_num_batched_tokens or sys.maxsize)
|
||||||
|
|
||||||
logger.debug("Setting max_num_seqs to %d for %s usage context.",
|
logger.debug("Setting max_num_seqs to %d for %s usage context.",
|
||||||
self.max_num_seqs, use_context_value)
|
self.max_num_seqs, use_context_value)
|
||||||
|
|||||||
@ -97,11 +97,16 @@ class MQLLMEngineClient(EngineClient):
|
|||||||
self.model_config = engine_config.model_config
|
self.model_config = engine_config.model_config
|
||||||
self.decoding_config = engine_config.decoding_config
|
self.decoding_config = engine_config.decoding_config
|
||||||
|
|
||||||
# Create the tokenizer group.
|
if self.vllm_config.model_config.skip_tokenizer_init:
|
||||||
self.tokenizer = init_tokenizer_from_configs(
|
self.tokenizer = None
|
||||||
model_config=self.model_config,
|
|
||||||
scheduler_config=engine_config.scheduler_config,
|
else:
|
||||||
lora_config=engine_config.lora_config)
|
# Create the tokenizer group.
|
||||||
|
self.tokenizer = init_tokenizer_from_configs(
|
||||||
|
model_config=self.model_config,
|
||||||
|
scheduler_config=engine_config.scheduler_config,
|
||||||
|
lora_config=engine_config.lora_config)
|
||||||
|
|
||||||
self.input_preprocessor = InputPreprocessor(self.model_config,
|
self.input_preprocessor = InputPreprocessor(self.model_config,
|
||||||
self.tokenizer)
|
self.tokenizer)
|
||||||
|
|
||||||
@ -375,7 +380,10 @@ class MQLLMEngineClient(EngineClient):
|
|||||||
return self.input_preprocessor
|
return self.input_preprocessor
|
||||||
|
|
||||||
async def get_tokenizer(self, lora_request: Optional[LoRARequest] = None):
|
async def get_tokenizer(self, lora_request: Optional[LoRARequest] = None):
|
||||||
return await self.tokenizer.get_lora_tokenizer_async(lora_request)
|
if self.tokenizer is None:
|
||||||
|
return None
|
||||||
|
else:
|
||||||
|
return await self.tokenizer.get_lora_tokenizer_async(lora_request)
|
||||||
|
|
||||||
async def get_vllm_config(self) -> VllmConfig:
|
async def get_vllm_config(self) -> VllmConfig:
|
||||||
return self.vllm_config
|
return self.vllm_config
|
||||||
|
|||||||
@ -14,6 +14,7 @@ from pydantic import ValidationError
|
|||||||
from tqdm.auto import tqdm
|
from tqdm.auto import tqdm
|
||||||
from typing_extensions import TypeVar, deprecated
|
from typing_extensions import TypeVar, deprecated
|
||||||
|
|
||||||
|
import vllm.envs as envs
|
||||||
from vllm.beam_search import (BeamSearchInstance, BeamSearchOutput,
|
from vllm.beam_search import (BeamSearchInstance, BeamSearchOutput,
|
||||||
BeamSearchSequence,
|
BeamSearchSequence,
|
||||||
create_sort_beams_key_function)
|
create_sort_beams_key_function)
|
||||||
@ -44,9 +45,10 @@ from vllm.model_executor.layers.quantization import QuantizationMethods
|
|||||||
from vllm.outputs import (ClassificationRequestOutput, EmbeddingRequestOutput,
|
from vllm.outputs import (ClassificationRequestOutput, EmbeddingRequestOutput,
|
||||||
PoolingRequestOutput, RequestOutput,
|
PoolingRequestOutput, RequestOutput,
|
||||||
ScoringRequestOutput)
|
ScoringRequestOutput)
|
||||||
from vllm.pooling_params import PoolingParams, PoolingTask
|
from vllm.pooling_params import PoolingParams
|
||||||
from vllm.sampling_params import (BeamSearchParams, GuidedDecodingParams,
|
from vllm.sampling_params import (BeamSearchParams, GuidedDecodingParams,
|
||||||
RequestOutputKind, SamplingParams)
|
RequestOutputKind, SamplingParams)
|
||||||
|
from vllm.tasks import PoolingTask
|
||||||
from vllm.transformers_utils.tokenizer import (AnyTokenizer, MistralTokenizer,
|
from vllm.transformers_utils.tokenizer import (AnyTokenizer, MistralTokenizer,
|
||||||
get_cached_tokenizer)
|
get_cached_tokenizer)
|
||||||
from vllm.usage.usage_lib import UsageContext
|
from vllm.usage.usage_lib import UsageContext
|
||||||
@ -277,6 +279,16 @@ class LLM:
|
|||||||
self.request_counter = Counter()
|
self.request_counter = Counter()
|
||||||
self.default_sampling_params: Union[dict[str, Any], None] = None
|
self.default_sampling_params: Union[dict[str, Any], None] = None
|
||||||
|
|
||||||
|
if envs.VLLM_USE_V1:
|
||||||
|
supported_tasks = self.llm_engine \
|
||||||
|
.get_supported_tasks() # type: ignore
|
||||||
|
else:
|
||||||
|
supported_tasks = self.llm_engine.model_config.supported_tasks
|
||||||
|
|
||||||
|
logger.info("Supported_tasks: %s", supported_tasks)
|
||||||
|
|
||||||
|
self.supported_tasks = supported_tasks
|
||||||
|
|
||||||
def get_tokenizer(
|
def get_tokenizer(
|
||||||
self,
|
self,
|
||||||
lora_request: Optional[LoRARequest] = None,
|
lora_request: Optional[LoRARequest] = None,
|
||||||
@ -1170,8 +1182,7 @@ class LLM:
|
|||||||
A list of `EmbeddingRequestOutput` objects containing the
|
A list of `EmbeddingRequestOutput` objects containing the
|
||||||
embedding vectors in the same order as the input prompts.
|
embedding vectors in the same order as the input prompts.
|
||||||
"""
|
"""
|
||||||
model_config = self.llm_engine.model_config
|
if "embed" not in self.supported_tasks:
|
||||||
if "embed" not in model_config.supported_tasks:
|
|
||||||
raise ValueError("Embedding API is not supported by this model. "
|
raise ValueError("Embedding API is not supported by this model. "
|
||||||
"Please set `--task embed`.")
|
"Please set `--task embed`.")
|
||||||
|
|
||||||
@ -1215,8 +1226,7 @@ class LLM:
|
|||||||
A list of `ClassificationRequestOutput` objects containing the
|
A list of `ClassificationRequestOutput` objects containing the
|
||||||
embedding vectors in the same order as the input prompts.
|
embedding vectors in the same order as the input prompts.
|
||||||
"""
|
"""
|
||||||
model_config = self.llm_engine.model_config
|
if "classify" not in self.supported_tasks:
|
||||||
if "classify" not in model_config.supported_tasks:
|
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"Classification API is not supported by this model. "
|
"Classification API is not supported by this model. "
|
||||||
"Please set `--task classify`.")
|
"Please set `--task classify`.")
|
||||||
@ -1397,8 +1407,8 @@ class LLM:
|
|||||||
|
|
||||||
raise ValueError(" ".join(messages))
|
raise ValueError(" ".join(messages))
|
||||||
|
|
||||||
if all(t not in model_config.supported_tasks
|
supported_tasks = self.supported_tasks
|
||||||
for t in ("embed", "classify")):
|
if all(t not in supported_tasks for t in ("embed", "classify")):
|
||||||
raise ValueError("Score API is not supported by this model. "
|
raise ValueError("Score API is not supported by this model. "
|
||||||
"Please set `--task embed` or `--task classify`.")
|
"Please set `--task embed` or `--task classify`.")
|
||||||
|
|
||||||
|
|||||||
@ -1586,6 +1586,14 @@ async def init_app_state(
|
|||||||
state.vllm_config = vllm_config
|
state.vllm_config = vllm_config
|
||||||
model_config = vllm_config.model_config
|
model_config = vllm_config.model_config
|
||||||
|
|
||||||
|
if envs.VLLM_USE_V1:
|
||||||
|
supported_tasks = await engine_client \
|
||||||
|
.get_supported_tasks() # type: ignore
|
||||||
|
else:
|
||||||
|
supported_tasks = model_config.supported_tasks
|
||||||
|
|
||||||
|
logger.info("Supported_tasks: %s", supported_tasks)
|
||||||
|
|
||||||
resolved_chat_template = load_chat_template(args.chat_template)
|
resolved_chat_template = load_chat_template(args.chat_template)
|
||||||
if resolved_chat_template is not None:
|
if resolved_chat_template is not None:
|
||||||
# Get the tokenizer to check official template
|
# Get the tokenizer to check official template
|
||||||
@ -1647,7 +1655,7 @@ async def init_app_state(
|
|||||||
reasoning_parser=args.reasoning_parser,
|
reasoning_parser=args.reasoning_parser,
|
||||||
enable_prompt_tokens_details=args.enable_prompt_tokens_details,
|
enable_prompt_tokens_details=args.enable_prompt_tokens_details,
|
||||||
enable_force_include_usage=args.enable_force_include_usage,
|
enable_force_include_usage=args.enable_force_include_usage,
|
||||||
) if "generate" in model_config.supported_tasks else None
|
) if "generate" in supported_tasks else None
|
||||||
state.openai_serving_chat = OpenAIServingChat(
|
state.openai_serving_chat = OpenAIServingChat(
|
||||||
engine_client,
|
engine_client,
|
||||||
model_config,
|
model_config,
|
||||||
@ -1664,7 +1672,7 @@ async def init_app_state(
|
|||||||
reasoning_parser=args.reasoning_parser,
|
reasoning_parser=args.reasoning_parser,
|
||||||
enable_prompt_tokens_details=args.enable_prompt_tokens_details,
|
enable_prompt_tokens_details=args.enable_prompt_tokens_details,
|
||||||
enable_force_include_usage=args.enable_force_include_usage,
|
enable_force_include_usage=args.enable_force_include_usage,
|
||||||
) if "generate" in model_config.supported_tasks else None
|
) if "generate" in supported_tasks else None
|
||||||
state.openai_serving_completion = OpenAIServingCompletion(
|
state.openai_serving_completion = OpenAIServingCompletion(
|
||||||
engine_client,
|
engine_client,
|
||||||
model_config,
|
model_config,
|
||||||
@ -1673,7 +1681,7 @@ async def init_app_state(
|
|||||||
return_tokens_as_token_ids=args.return_tokens_as_token_ids,
|
return_tokens_as_token_ids=args.return_tokens_as_token_ids,
|
||||||
enable_prompt_tokens_details=args.enable_prompt_tokens_details,
|
enable_prompt_tokens_details=args.enable_prompt_tokens_details,
|
||||||
enable_force_include_usage=args.enable_force_include_usage,
|
enable_force_include_usage=args.enable_force_include_usage,
|
||||||
) if "generate" in model_config.supported_tasks else None
|
) if "generate" in supported_tasks else None
|
||||||
state.openai_serving_pooling = OpenAIServingPooling(
|
state.openai_serving_pooling = OpenAIServingPooling(
|
||||||
engine_client,
|
engine_client,
|
||||||
model_config,
|
model_config,
|
||||||
@ -1681,7 +1689,7 @@ async def init_app_state(
|
|||||||
request_logger=request_logger,
|
request_logger=request_logger,
|
||||||
chat_template=resolved_chat_template,
|
chat_template=resolved_chat_template,
|
||||||
chat_template_content_format=args.chat_template_content_format,
|
chat_template_content_format=args.chat_template_content_format,
|
||||||
) if "encode" in model_config.supported_tasks else None
|
) if "encode" in supported_tasks else None
|
||||||
state.openai_serving_embedding = OpenAIServingEmbedding(
|
state.openai_serving_embedding = OpenAIServingEmbedding(
|
||||||
engine_client,
|
engine_client,
|
||||||
model_config,
|
model_config,
|
||||||
@ -1689,24 +1697,22 @@ async def init_app_state(
|
|||||||
request_logger=request_logger,
|
request_logger=request_logger,
|
||||||
chat_template=resolved_chat_template,
|
chat_template=resolved_chat_template,
|
||||||
chat_template_content_format=args.chat_template_content_format,
|
chat_template_content_format=args.chat_template_content_format,
|
||||||
) if "embed" in model_config.supported_tasks else None
|
) if "embed" in supported_tasks else None
|
||||||
state.openai_serving_classification = ServingClassification(
|
state.openai_serving_classification = ServingClassification(
|
||||||
engine_client,
|
engine_client,
|
||||||
model_config,
|
model_config,
|
||||||
state.openai_serving_models,
|
state.openai_serving_models,
|
||||||
request_logger=request_logger,
|
request_logger=request_logger,
|
||||||
) if "classify" in model_config.supported_tasks else None
|
) if "classify" in supported_tasks else None
|
||||||
|
|
||||||
enable_serving_reranking = ("classify" in model_config.supported_tasks
|
enable_serving_reranking = ("classify" in supported_tasks and getattr(
|
||||||
and getattr(model_config.hf_config,
|
model_config.hf_config, "num_labels", 0) == 1)
|
||||||
"num_labels", 0) == 1)
|
|
||||||
state.openai_serving_scores = ServingScores(
|
state.openai_serving_scores = ServingScores(
|
||||||
engine_client,
|
engine_client,
|
||||||
model_config,
|
model_config,
|
||||||
state.openai_serving_models,
|
state.openai_serving_models,
|
||||||
request_logger=request_logger,
|
request_logger=request_logger,
|
||||||
) if ("embed" in model_config.supported_tasks
|
) if ("embed" in supported_tasks or enable_serving_reranking) else None
|
||||||
or enable_serving_reranking) else None
|
|
||||||
|
|
||||||
state.openai_serving_tokenization = OpenAIServingTokenization(
|
state.openai_serving_tokenization = OpenAIServingTokenization(
|
||||||
engine_client,
|
engine_client,
|
||||||
@ -1721,13 +1727,13 @@ async def init_app_state(
|
|||||||
model_config,
|
model_config,
|
||||||
state.openai_serving_models,
|
state.openai_serving_models,
|
||||||
request_logger=request_logger,
|
request_logger=request_logger,
|
||||||
) if "transcription" in model_config.supported_tasks else None
|
) if "transcription" in supported_tasks else None
|
||||||
state.openai_serving_translation = OpenAIServingTranslation(
|
state.openai_serving_translation = OpenAIServingTranslation(
|
||||||
engine_client,
|
engine_client,
|
||||||
model_config,
|
model_config,
|
||||||
state.openai_serving_models,
|
state.openai_serving_models,
|
||||||
request_logger=request_logger,
|
request_logger=request_logger,
|
||||||
) if "transcription" in model_config.supported_tasks else None
|
) if "transcription" in supported_tasks else None
|
||||||
state.task = model_config.task
|
state.task = model_config.task
|
||||||
|
|
||||||
state.enable_server_load_tracking = args.enable_server_load_tracking
|
state.enable_server_load_tracking = args.enable_server_load_tracking
|
||||||
|
|||||||
@ -1007,6 +1007,13 @@ class CompletionRequest(OpenAIBaseModel):
|
|||||||
"default: 0). Any priority other than 0 will raise an error "
|
"default: 0). Any priority other than 0 will raise an error "
|
||||||
"if the served model does not use priority scheduling."),
|
"if the served model does not use priority scheduling."),
|
||||||
)
|
)
|
||||||
|
request_id: str = Field(
|
||||||
|
default_factory=lambda: f"{random_uuid()}",
|
||||||
|
description=(
|
||||||
|
"The request_id related to this request. If the caller does "
|
||||||
|
"not set it, a random_uuid will be generated. This id is used "
|
||||||
|
"through out the inference process and return in response."),
|
||||||
|
)
|
||||||
logits_processors: Optional[LogitsProcessors] = Field(
|
logits_processors: Optional[LogitsProcessors] = Field(
|
||||||
default=None,
|
default=None,
|
||||||
description=(
|
description=(
|
||||||
@ -1251,6 +1258,13 @@ class EmbeddingCompletionRequest(OpenAIBaseModel):
|
|||||||
"default: 0). Any priority other than 0 will raise an error "
|
"default: 0). Any priority other than 0 will raise an error "
|
||||||
"if the served model does not use priority scheduling."),
|
"if the served model does not use priority scheduling."),
|
||||||
)
|
)
|
||||||
|
request_id: str = Field(
|
||||||
|
default_factory=lambda: f"{random_uuid()}",
|
||||||
|
description=(
|
||||||
|
"The request_id related to this request. If the caller does "
|
||||||
|
"not set it, a random_uuid will be generated. This id is used "
|
||||||
|
"through out the inference process and return in response."),
|
||||||
|
)
|
||||||
|
|
||||||
# --8<-- [end:embedding-extra-params]
|
# --8<-- [end:embedding-extra-params]
|
||||||
|
|
||||||
@ -1302,6 +1316,13 @@ class EmbeddingChatRequest(OpenAIBaseModel):
|
|||||||
"default: 0). Any priority other than 0 will raise an error "
|
"default: 0). Any priority other than 0 will raise an error "
|
||||||
"if the served model does not use priority scheduling."),
|
"if the served model does not use priority scheduling."),
|
||||||
)
|
)
|
||||||
|
request_id: str = Field(
|
||||||
|
default_factory=lambda: f"{random_uuid()}",
|
||||||
|
description=(
|
||||||
|
"The request_id related to this request. If the caller does "
|
||||||
|
"not set it, a random_uuid will be generated. This id is used "
|
||||||
|
"through out the inference process and return in response."),
|
||||||
|
)
|
||||||
# --8<-- [end:chat-embedding-extra-params]
|
# --8<-- [end:chat-embedding-extra-params]
|
||||||
|
|
||||||
@model_validator(mode="before")
|
@model_validator(mode="before")
|
||||||
|
|||||||
@ -14,6 +14,7 @@ import torch
|
|||||||
from prometheus_client import start_http_server
|
from prometheus_client import start_http_server
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
import vllm.envs as envs
|
||||||
from vllm.config import VllmConfig
|
from vllm.config import VllmConfig
|
||||||
from vllm.engine.arg_utils import AsyncEngineArgs, optional_type
|
from vllm.engine.arg_utils import AsyncEngineArgs, optional_type
|
||||||
from vllm.engine.protocol import EngineClient
|
from vllm.engine.protocol import EngineClient
|
||||||
@ -335,6 +336,14 @@ async def run_batch(
|
|||||||
|
|
||||||
model_config = vllm_config.model_config
|
model_config = vllm_config.model_config
|
||||||
|
|
||||||
|
if envs.VLLM_USE_V1:
|
||||||
|
supported_tasks = await engine_client \
|
||||||
|
.get_supported_tasks() # type: ignore
|
||||||
|
else:
|
||||||
|
supported_tasks = model_config.supported_tasks
|
||||||
|
|
||||||
|
logger.info("Supported_tasks: %s", supported_tasks)
|
||||||
|
|
||||||
# Create the openai serving objects.
|
# Create the openai serving objects.
|
||||||
openai_serving_models = OpenAIServingModels(
|
openai_serving_models = OpenAIServingModels(
|
||||||
engine_client=engine_client,
|
engine_client=engine_client,
|
||||||
@ -351,7 +360,7 @@ async def run_batch(
|
|||||||
chat_template=None,
|
chat_template=None,
|
||||||
chat_template_content_format="auto",
|
chat_template_content_format="auto",
|
||||||
enable_prompt_tokens_details=args.enable_prompt_tokens_details,
|
enable_prompt_tokens_details=args.enable_prompt_tokens_details,
|
||||||
) if "generate" in model_config.supported_tasks else None
|
) if "generate" in supported_tasks else None
|
||||||
openai_serving_embedding = OpenAIServingEmbedding(
|
openai_serving_embedding = OpenAIServingEmbedding(
|
||||||
engine_client,
|
engine_client,
|
||||||
model_config,
|
model_config,
|
||||||
@ -359,19 +368,17 @@ async def run_batch(
|
|||||||
request_logger=request_logger,
|
request_logger=request_logger,
|
||||||
chat_template=None,
|
chat_template=None,
|
||||||
chat_template_content_format="auto",
|
chat_template_content_format="auto",
|
||||||
) if "embed" in model_config.supported_tasks else None
|
) if "embed" in supported_tasks else None
|
||||||
|
|
||||||
enable_serving_reranking = ("classify" in model_config.supported_tasks
|
enable_serving_reranking = ("classify" in supported_tasks and getattr(
|
||||||
and getattr(model_config.hf_config,
|
model_config.hf_config, "num_labels", 0) == 1)
|
||||||
"num_labels", 0) == 1)
|
|
||||||
|
|
||||||
openai_serving_scores = ServingScores(
|
openai_serving_scores = ServingScores(
|
||||||
engine_client,
|
engine_client,
|
||||||
model_config,
|
model_config,
|
||||||
openai_serving_models,
|
openai_serving_models,
|
||||||
request_logger=request_logger,
|
request_logger=request_logger,
|
||||||
) if ("embed" in model_config.supported_tasks
|
) if ("embed" in supported_tasks or enable_serving_reranking) else None
|
||||||
or enable_serving_reranking) else None
|
|
||||||
|
|
||||||
tracker = BatchProgressTracker()
|
tracker = BatchProgressTracker()
|
||||||
logger.info("Reading batch from %s...", args.input_file)
|
logger.info("Reading batch from %s...", args.input_file)
|
||||||
|
|||||||
@ -113,7 +113,9 @@ class OpenAIServingCompletion(OpenAIServing):
|
|||||||
return self.create_error_response(
|
return self.create_error_response(
|
||||||
"Echo is unsupported with prompt embeds.")
|
"Echo is unsupported with prompt embeds.")
|
||||||
|
|
||||||
request_id = f"cmpl-{self._base_request_id(raw_request)}"
|
request_id = (
|
||||||
|
f"cmpl-"
|
||||||
|
f"{self._base_request_id(raw_request, request.request_id)}")
|
||||||
created_time = int(time.time())
|
created_time = int(time.time())
|
||||||
|
|
||||||
request_metadata = RequestResponseMetadata(request_id=request_id)
|
request_metadata = RequestResponseMetadata(request_id=request_id)
|
||||||
|
|||||||
@ -163,8 +163,9 @@ class OpenAIServingEmbedding(EmbeddingMixin):
|
|||||||
for the API specification. This API mimics the OpenAI Embedding API.
|
for the API specification. This API mimics the OpenAI Embedding API.
|
||||||
"""
|
"""
|
||||||
model_name = self._get_model_name(request.model)
|
model_name = self._get_model_name(request.model)
|
||||||
request_id = (f"{self.request_id_prefix}-"
|
request_id = (
|
||||||
f"{self._base_request_id(raw_request)}")
|
f"{self.request_id_prefix}-"
|
||||||
|
f"{self._base_request_id(raw_request, request.request_id)}")
|
||||||
|
|
||||||
ctx = EmbeddingServeContext(
|
ctx = EmbeddingServeContext(
|
||||||
request=request,
|
request=request,
|
||||||
|
|||||||
@ -880,7 +880,10 @@ class OpenAIServing:
|
|||||||
_chat_template_kwargs.update(chat_template_kwargs or {})
|
_chat_template_kwargs.update(chat_template_kwargs or {})
|
||||||
|
|
||||||
request_prompt: Union[str, list[int]]
|
request_prompt: Union[str, list[int]]
|
||||||
if isinstance(tokenizer, MistralTokenizer):
|
|
||||||
|
if tokenizer is None:
|
||||||
|
request_prompt = "placeholder"
|
||||||
|
elif isinstance(tokenizer, MistralTokenizer):
|
||||||
request_prompt = apply_mistral_chat_template(
|
request_prompt = apply_mistral_chat_template(
|
||||||
tokenizer,
|
tokenizer,
|
||||||
messages=messages,
|
messages=messages,
|
||||||
@ -910,7 +913,14 @@ class OpenAIServing:
|
|||||||
request = tool_parser(tokenizer).adjust_request( # type: ignore
|
request = tool_parser(tokenizer).adjust_request( # type: ignore
|
||||||
request=request)
|
request=request)
|
||||||
|
|
||||||
if isinstance(request_prompt, str):
|
if tokenizer is None:
|
||||||
|
assert isinstance(request_prompt, str), (
|
||||||
|
"Prompt has to be a string", \
|
||||||
|
"when the tokenizer is not initialised"
|
||||||
|
)
|
||||||
|
prompt_inputs = TextTokensPrompt(prompt=request_prompt,
|
||||||
|
prompt_token_ids=[1])
|
||||||
|
elif isinstance(request_prompt, str):
|
||||||
prompt_inputs = await self._tokenize_prompt_input_async(
|
prompt_inputs = await self._tokenize_prompt_input_async(
|
||||||
request,
|
request,
|
||||||
tokenizer,
|
tokenizer,
|
||||||
@ -947,9 +957,11 @@ class OpenAIServing:
|
|||||||
def _load_and_validate_embed(embed: bytes) -> EmbedsPrompt:
|
def _load_and_validate_embed(embed: bytes) -> EmbedsPrompt:
|
||||||
tensor = torch.load(io.BytesIO(base64.b64decode(embed)),
|
tensor = torch.load(io.BytesIO(base64.b64decode(embed)),
|
||||||
weights_only=True)
|
weights_only=True)
|
||||||
assert isinstance(
|
assert isinstance(tensor, torch.Tensor) and tensor.dtype in (
|
||||||
tensor,
|
torch.float32,
|
||||||
(torch.FloatTensor, torch.BFloat16Tensor, torch.HalfTensor))
|
torch.bfloat16,
|
||||||
|
torch.float16,
|
||||||
|
)
|
||||||
if tensor.dim() > 2:
|
if tensor.dim() > 2:
|
||||||
tensor = tensor.squeeze(0)
|
tensor = tensor.squeeze(0)
|
||||||
assert tensor.dim() == 2
|
assert tensor.dim() == 2
|
||||||
|
|||||||
@ -96,7 +96,11 @@ class OpenAIServingPooling(OpenAIServing):
|
|||||||
self.max_model_len, truncate_prompt_tokens)
|
self.max_model_len, truncate_prompt_tokens)
|
||||||
lora_request = self._maybe_get_adapters(request)
|
lora_request = self._maybe_get_adapters(request)
|
||||||
|
|
||||||
tokenizer = await self.engine_client.get_tokenizer(lora_request)
|
if self.model_config.skip_tokenizer_init:
|
||||||
|
tokenizer = None
|
||||||
|
else:
|
||||||
|
tokenizer = await self.engine_client.get_tokenizer(lora_request
|
||||||
|
)
|
||||||
|
|
||||||
if isinstance(request, PoolingChatRequest):
|
if isinstance(request, PoolingChatRequest):
|
||||||
(
|
(
|
||||||
|
|||||||
@ -16,8 +16,8 @@ from vllm.config import VllmConfig
|
|||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.lora.request import LoRARequest
|
from vllm.lora.request import LoRARequest
|
||||||
from vllm.model_executor.layers.sampler import SamplerOutput
|
from vllm.model_executor.layers.sampler import SamplerOutput
|
||||||
from vllm.pooling_params import PoolingTask
|
|
||||||
from vllm.sequence import ExecuteModelRequest, PoolerOutput
|
from vllm.sequence import ExecuteModelRequest, PoolerOutput
|
||||||
|
from vllm.tasks import SupportedTask
|
||||||
from vllm.utils import make_async
|
from vllm.utils import make_async
|
||||||
from vllm.worker.worker_base import WorkerBase
|
from vllm.worker.worker_base import WorkerBase
|
||||||
|
|
||||||
@ -136,9 +136,9 @@ class ExecutorBase(ABC):
|
|||||||
return self.collective_rpc(rpc_func)
|
return self.collective_rpc(rpc_func)
|
||||||
|
|
||||||
@cached_property # Avoid unnecessary RPC calls
|
@cached_property # Avoid unnecessary RPC calls
|
||||||
def supported_pooling_tasks(self) -> tuple[PoolingTask, ...]:
|
def supported_tasks(self) -> tuple[SupportedTask, ...]:
|
||||||
output = self.collective_rpc("get_supported_pooling_tasks")
|
output = self.collective_rpc("get_supported_tasks")
|
||||||
return tuple({task for tasks in output for task in tasks})
|
return output[0]
|
||||||
|
|
||||||
def execute_model(
|
def execute_model(
|
||||||
self, execute_model_req: ExecuteModelRequest
|
self, execute_model_req: ExecuteModelRequest
|
||||||
|
|||||||
@ -1127,6 +1127,7 @@ def flashinfer_fused_moe_blockscale_fp8(
|
|||||||
tile_tokens_dim=_get_tile_tokens_dim(x.shape[0], top_k,
|
tile_tokens_dim=_get_tile_tokens_dim(x.shape[0], top_k,
|
||||||
global_num_experts),
|
global_num_experts),
|
||||||
routing_method_type=2, # DeepSeek-styled routing method
|
routing_method_type=2, # DeepSeek-styled routing method
|
||||||
|
use_shuffled_weight=False,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -5,144 +5,8 @@ from typing import Optional
|
|||||||
import torch
|
import torch
|
||||||
|
|
||||||
from vllm import _custom_ops as ops
|
from vllm import _custom_ops as ops
|
||||||
from vllm.triton_utils import tl, triton
|
from vllm.triton_utils import triton
|
||||||
from vllm.utils import cdiv, round_up
|
from vllm.utils import round_up
|
||||||
|
|
||||||
|
|
||||||
@triton.jit
|
|
||||||
def moe_align_block_size_stage1(
|
|
||||||
topk_ids_ptr,
|
|
||||||
tokens_cnts_ptr,
|
|
||||||
num_experts: tl.constexpr,
|
|
||||||
numel: tl.constexpr,
|
|
||||||
tokens_per_thread: tl.constexpr,
|
|
||||||
):
|
|
||||||
pid = tl.program_id(0)
|
|
||||||
|
|
||||||
start_idx = pid * tokens_per_thread
|
|
||||||
|
|
||||||
off_c = (pid + 1) * num_experts
|
|
||||||
|
|
||||||
for i in range(tokens_per_thread):
|
|
||||||
if start_idx + i < numel:
|
|
||||||
idx = tl.load(topk_ids_ptr + start_idx + i)
|
|
||||||
token_cnt = tl.load(tokens_cnts_ptr + off_c + idx)
|
|
||||||
tl.store(tokens_cnts_ptr + off_c + idx, token_cnt + 1)
|
|
||||||
|
|
||||||
|
|
||||||
@triton.jit
|
|
||||||
def moe_align_block_size_stage2(
|
|
||||||
tokens_cnts_ptr,
|
|
||||||
num_experts: tl.constexpr,
|
|
||||||
):
|
|
||||||
pid = tl.program_id(0)
|
|
||||||
|
|
||||||
last_cnt = 0
|
|
||||||
for i in range(1, num_experts + 1):
|
|
||||||
token_cnt = tl.load(tokens_cnts_ptr + i * num_experts + pid)
|
|
||||||
last_cnt = last_cnt + token_cnt
|
|
||||||
tl.store(tokens_cnts_ptr + i * num_experts + pid, last_cnt)
|
|
||||||
|
|
||||||
|
|
||||||
@triton.jit
|
|
||||||
def moe_align_block_size_stage3(
|
|
||||||
total_tokens_post_pad_ptr,
|
|
||||||
tokens_cnts_ptr,
|
|
||||||
cumsum_ptr,
|
|
||||||
num_experts: tl.constexpr,
|
|
||||||
block_size: tl.constexpr,
|
|
||||||
):
|
|
||||||
last_cumsum = 0
|
|
||||||
off_cnt = num_experts * num_experts
|
|
||||||
for i in range(1, num_experts + 1):
|
|
||||||
token_cnt = tl.load(tokens_cnts_ptr + off_cnt + i - 1)
|
|
||||||
last_cumsum = last_cumsum + tl.cdiv(token_cnt, block_size) * block_size
|
|
||||||
tl.store(cumsum_ptr + i, last_cumsum)
|
|
||||||
tl.store(total_tokens_post_pad_ptr, last_cumsum)
|
|
||||||
|
|
||||||
|
|
||||||
@triton.jit
|
|
||||||
def moe_align_block_size_stage4(
|
|
||||||
topk_ids_ptr,
|
|
||||||
sorted_token_ids_ptr,
|
|
||||||
expert_ids_ptr,
|
|
||||||
tokens_cnts_ptr,
|
|
||||||
cumsum_ptr,
|
|
||||||
num_experts: tl.constexpr,
|
|
||||||
block_size: tl.constexpr,
|
|
||||||
numel: tl.constexpr,
|
|
||||||
tokens_per_thread: tl.constexpr,
|
|
||||||
):
|
|
||||||
pid = tl.program_id(0)
|
|
||||||
start_idx = tl.load(cumsum_ptr + pid)
|
|
||||||
end_idx = tl.load(cumsum_ptr + pid + 1)
|
|
||||||
|
|
||||||
for i in range(start_idx, end_idx, block_size):
|
|
||||||
tl.store(expert_ids_ptr + i // block_size, pid)
|
|
||||||
|
|
||||||
start_idx = pid * tokens_per_thread
|
|
||||||
off_t = pid * num_experts
|
|
||||||
|
|
||||||
for i in range(start_idx, tl.minimum(start_idx + tokens_per_thread,
|
|
||||||
numel)):
|
|
||||||
expert_id = tl.load(topk_ids_ptr + i)
|
|
||||||
token_cnt = tl.load(tokens_cnts_ptr + off_t + expert_id)
|
|
||||||
rank_post_pad = token_cnt + tl.load(cumsum_ptr + expert_id)
|
|
||||||
tl.store(sorted_token_ids_ptr + rank_post_pad, i)
|
|
||||||
tl.store(tokens_cnts_ptr + off_t + expert_id, token_cnt + 1)
|
|
||||||
|
|
||||||
|
|
||||||
# Triton implementation based on:
|
|
||||||
# https://github.com/sgl-project/sglang/commit/ba5112ff691d791a9e38c6c71f59324a5fcb49d0
|
|
||||||
def moe_align_block_size_triton(
|
|
||||||
topk_ids: torch.Tensor,
|
|
||||||
num_experts: int,
|
|
||||||
block_size: int,
|
|
||||||
sorted_token_ids: torch.Tensor,
|
|
||||||
expert_ids: torch.Tensor,
|
|
||||||
num_tokens_post_pad: torch.Tensor,
|
|
||||||
) -> None:
|
|
||||||
numel = topk_ids.numel()
|
|
||||||
grid = (num_experts, )
|
|
||||||
tokens_cnts = torch.zeros((num_experts + 1, num_experts),
|
|
||||||
dtype=torch.int32,
|
|
||||||
device=topk_ids.device)
|
|
||||||
cumsum = torch.zeros((num_experts + 1, ),
|
|
||||||
dtype=torch.int32,
|
|
||||||
device=topk_ids.device)
|
|
||||||
tokens_per_thread = cdiv(numel, num_experts)
|
|
||||||
sorted_token_ids.fill_(numel)
|
|
||||||
expert_ids.zero_()
|
|
||||||
|
|
||||||
moe_align_block_size_stage1[grid](
|
|
||||||
topk_ids,
|
|
||||||
tokens_cnts,
|
|
||||||
num_experts,
|
|
||||||
numel,
|
|
||||||
tokens_per_thread,
|
|
||||||
)
|
|
||||||
moe_align_block_size_stage2[grid](
|
|
||||||
tokens_cnts,
|
|
||||||
num_experts,
|
|
||||||
)
|
|
||||||
moe_align_block_size_stage3[(1, )](
|
|
||||||
num_tokens_post_pad,
|
|
||||||
tokens_cnts,
|
|
||||||
cumsum,
|
|
||||||
num_experts,
|
|
||||||
block_size,
|
|
||||||
)
|
|
||||||
moe_align_block_size_stage4[grid](
|
|
||||||
topk_ids,
|
|
||||||
sorted_token_ids,
|
|
||||||
expert_ids,
|
|
||||||
tokens_cnts,
|
|
||||||
cumsum,
|
|
||||||
num_experts,
|
|
||||||
block_size,
|
|
||||||
numel,
|
|
||||||
tokens_per_thread,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def moe_align_block_size(
|
def moe_align_block_size(
|
||||||
|
|||||||
@ -76,25 +76,22 @@ def _moe_unpermute_and_reduce(
|
|||||||
|
|
||||||
def moe_permute(
|
def moe_permute(
|
||||||
hidden_states: torch.Tensor,
|
hidden_states: torch.Tensor,
|
||||||
topk_weights: torch.Tensor,
|
a1q_scale: Optional[torch.Tensor],
|
||||||
topk_ids: torch.Tensor,
|
topk_ids: torch.Tensor,
|
||||||
token_expert_indices: torch.Tensor,
|
|
||||||
topk: int,
|
|
||||||
n_expert: int,
|
n_expert: int,
|
||||||
n_local_expert: int,
|
n_local_expert: int = -1,
|
||||||
expert_map: Optional[torch.Tensor] = None,
|
expert_map: Optional[torch.Tensor] = None,
|
||||||
align_block_size: Optional[int] = None,
|
align_block_size: Optional[int] = None,
|
||||||
fill_invalid_expert: int = -1
|
fill_invalid_expert: int = -1
|
||||||
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
|
) -> tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor, torch.Tensor,
|
||||||
|
torch.Tensor]:
|
||||||
"""
|
"""
|
||||||
This function expands and permutes activation to gather uncontinuous tokens
|
This function expands and permutes activation to gather uncontinuous tokens
|
||||||
for each expert.
|
for each expert.
|
||||||
Parameters:
|
Parameters:
|
||||||
- hidden_states (torch.Tensor): The input tensor to the MoE layer.
|
- hidden_states (torch.Tensor): The input tensor to the MoE layer.
|
||||||
- topk_weights (torch.Tensor): topk expert route weight for each token.
|
- a1q_scale (Optional[torch.Tensor]): quant scale for hidden_states
|
||||||
- topk_ids (torch.Tensor): topk expert route id for each token.
|
- topk_ids (torch.Tensor): topk expert route id for each token.
|
||||||
- token_expert_indices (torch.Tensor): indice for expanded hidden.
|
|
||||||
- topk (int): The number of top-k experts to select.
|
|
||||||
- n_expert (int): The number of expert.
|
- n_expert (int): The number of expert.
|
||||||
- n_local_expert (int): The number of expert in current EP rank.
|
- n_local_expert (int): The number of expert in current EP rank.
|
||||||
- expert_map (Optional[torch.Tensor]): A tensor mapping expert indices
|
- expert_map (Optional[torch.Tensor]): A tensor mapping expert indices
|
||||||
@ -105,14 +102,17 @@ def moe_permute(
|
|||||||
to workaround DeepGemm unsupported -1 in m_indices
|
to workaround DeepGemm unsupported -1 in m_indices
|
||||||
Returns:
|
Returns:
|
||||||
- permuted_hidden_states (torch.Tensor): permuted activation.
|
- permuted_hidden_states (torch.Tensor): permuted activation.
|
||||||
|
- a1q_scale (Optional[torch.Tensor]): quant scale for hidden_states
|
||||||
- expert_first_token_offset (torch.Tensor): offset of the first token
|
- expert_first_token_offset (torch.Tensor): offset of the first token
|
||||||
of each expert for standard grouped gemm. if enable 'align_block_size'
|
of each expert for standard grouped gemm. if enable 'align_block_size'
|
||||||
expert_first_token_offset will align up to 'align_block_size'.
|
expert_first_token_offset will align up to 'align_block_size'.
|
||||||
- src_row_id2dst_row_id_map (torch.Tensor): idx map for moe_unpermute.
|
- inv_permuted_idx (torch.Tensor): idx map for moe_unpermute.
|
||||||
|
- permuted_idx (torch.Tensor): idx map from hidden to permuted_hidden.
|
||||||
- m_indices: m_indices for grouped gemm in deepgemm,`m_indices[i]` records
|
- m_indices: m_indices for grouped gemm in deepgemm,`m_indices[i]` records
|
||||||
the group which the j-th row of the LHS belong to.`
|
the group which the j-th row of the LHS belong to.`
|
||||||
"""
|
"""
|
||||||
n_token, n_hidden = hidden_states.size()
|
n_token, n_hidden = hidden_states.size()
|
||||||
|
topk = topk_ids.size(1)
|
||||||
assert (n_hidden * hidden_states.element_size()
|
assert (n_hidden * hidden_states.element_size()
|
||||||
) % 16 == 0, "permue kernel need hidden dim align to 16B"
|
) % 16 == 0, "permue kernel need hidden dim align to 16B"
|
||||||
permuted_row_size = n_token * topk
|
permuted_row_size = n_token * topk
|
||||||
@ -120,12 +120,19 @@ def moe_permute(
|
|||||||
permuted_row_size = (permuted_row_size + n_expert *
|
permuted_row_size = (permuted_row_size + n_expert *
|
||||||
(align_block_size - 1) + align_block_size -
|
(align_block_size - 1) + align_block_size -
|
||||||
1) // align_block_size * align_block_size
|
1) // align_block_size * align_block_size
|
||||||
|
if n_local_expert == -1:
|
||||||
|
n_local_expert = n_expert
|
||||||
permuted_hidden_states = torch.empty(
|
permuted_hidden_states = torch.empty(
|
||||||
(permuted_row_size, n_hidden),
|
(permuted_row_size, n_hidden),
|
||||||
dtype=hidden_states.dtype,
|
dtype=hidden_states.dtype,
|
||||||
device=hidden_states.device,
|
device=hidden_states.device,
|
||||||
)
|
)
|
||||||
|
token_expert_indices = torch.arange(0,
|
||||||
|
n_token * topk,
|
||||||
|
dtype=torch.int32,
|
||||||
|
device=hidden_states.device).reshape(
|
||||||
|
(n_token, topk))
|
||||||
|
|
||||||
m_indices = torch.full((permuted_row_size, ),
|
m_indices = torch.full((permuted_row_size, ),
|
||||||
fill_invalid_expert,
|
fill_invalid_expert,
|
||||||
dtype=torch.int32,
|
dtype=torch.int32,
|
||||||
@ -133,57 +140,54 @@ def moe_permute(
|
|||||||
expert_first_token_offset = torch.empty(n_local_expert + 1,
|
expert_first_token_offset = torch.empty(n_local_expert + 1,
|
||||||
dtype=torch.int64,
|
dtype=torch.int64,
|
||||||
device=hidden_states.device)
|
device=hidden_states.device)
|
||||||
src_row_id2dst_row_id_map = torch.empty((n_token, topk),
|
permuted_idx = torch.full((permuted_row_size, ),
|
||||||
dtype=torch.int32,
|
n_token * topk,
|
||||||
device=hidden_states.device)
|
dtype=torch.int32,
|
||||||
torch.ops._moe_C.moe_permute(hidden_states, topk_weights, topk_ids,
|
device=hidden_states.device)
|
||||||
token_expert_indices, expert_map, n_expert,
|
inv_permuted_idx = torch.empty((n_token, topk),
|
||||||
n_local_expert, topk, align_block_size,
|
dtype=torch.int32,
|
||||||
permuted_hidden_states,
|
device=hidden_states.device)
|
||||||
expert_first_token_offset,
|
topk_ids = topk_ids.to(torch.int32)
|
||||||
src_row_id2dst_row_id_map, m_indices)
|
torch.ops._moe_C.moe_permute(hidden_states, topk_ids, token_expert_indices,
|
||||||
return (permuted_hidden_states, expert_first_token_offset,
|
expert_map, n_expert, n_local_expert, topk,
|
||||||
src_row_id2dst_row_id_map, m_indices)
|
align_block_size, permuted_hidden_states,
|
||||||
|
expert_first_token_offset, inv_permuted_idx,
|
||||||
|
permuted_idx, m_indices)
|
||||||
|
if a1q_scale is not None:
|
||||||
|
a1q_scale = a1q_scale[permuted_idx.clamp(max=n_token * topk - 1) //
|
||||||
|
topk]
|
||||||
|
return (permuted_hidden_states, a1q_scale, expert_first_token_offset,
|
||||||
|
inv_permuted_idx.flatten(), m_indices)
|
||||||
|
|
||||||
|
|
||||||
def moe_unpermute(
|
def moe_unpermute(
|
||||||
|
out: torch.Tensor,
|
||||||
permuted_hidden_states: torch.Tensor,
|
permuted_hidden_states: torch.Tensor,
|
||||||
topk_weights: torch.Tensor,
|
topk_weights: torch.Tensor,
|
||||||
topk_ids: torch.Tensor,
|
inv_permuted_idx: torch.Tensor,
|
||||||
src_row_id2dst_row_id_map: torch.Tensor,
|
expert_first_token_offset: Optional[torch.Tensor] = None,
|
||||||
expert_first_token_offset: torch.Tensor,
|
) -> None:
|
||||||
topk: int,
|
|
||||||
n_expert: int,
|
|
||||||
n_local_expert: int,
|
|
||||||
) -> torch.Tensor:
|
|
||||||
"""
|
"""
|
||||||
This function expands and permutes activation to gathering uncontinuous
|
This function expands and permutes activation to gathering uncontinuous
|
||||||
tokens for each expert.
|
tokens for each expert.
|
||||||
Parameters:
|
Parameters:
|
||||||
|
- out (torch.Tensor): output tensor
|
||||||
- permuted_hidden_states (torch.Tensor): permuted activation.
|
- permuted_hidden_states (torch.Tensor): permuted activation.
|
||||||
- topk_weights (torch.Tensor): topk expert route weight for each token.
|
- topk_weights (torch.Tensor): topk expert route weight for each token.
|
||||||
- topk_ids (torch.Tensor): topk expert route id for each token.
|
- inv_permuted_idx (torch.Tensor): row idx map for moe_unpermute.
|
||||||
- expert_first_token_offset (torch.Tensor): offset of the first token
|
- expert_first_token_offset (Optional[torch.Tensor]): offset of the first
|
||||||
of each expert for grouped gemm.
|
token of each expert for grouped gemm.
|
||||||
- topk (int): The number of top-k experts to select.
|
|
||||||
- n_expert (int): The number of expert.
|
|
||||||
- n_local_expert (int): The number of expert in current EP rank.
|
|
||||||
Returns:
|
Returns:
|
||||||
- hidden_states (torch.Tensor): The reduced and unpermuted activation
|
- hidden_states (torch.Tensor): The reduced and unpermuted activation
|
||||||
tensor.
|
tensor.
|
||||||
"""
|
"""
|
||||||
n_token, n_hidden = topk_weights.size(0), permuted_hidden_states.size(-1)
|
topk = topk_weights.size(1)
|
||||||
|
n_hidden = permuted_hidden_states.size(-1)
|
||||||
assert (n_hidden * permuted_hidden_states.element_size()
|
assert (n_hidden * permuted_hidden_states.element_size()
|
||||||
) % 16 == 0, "unpermue kernel need hidden dim align to 16B"
|
) % 16 == 0, "unpermue kernel need hidden dim align to 16B"
|
||||||
hidden_states = torch.empty((n_token, n_hidden),
|
|
||||||
dtype=permuted_hidden_states.dtype,
|
|
||||||
device=permuted_hidden_states.device)
|
|
||||||
|
|
||||||
torch.ops._moe_C.moe_unpermute(permuted_hidden_states, topk_weights,
|
torch.ops._moe_C.moe_unpermute(permuted_hidden_states, topk_weights,
|
||||||
topk_ids, src_row_id2dst_row_id_map,
|
inv_permuted_idx, expert_first_token_offset,
|
||||||
expert_first_token_offset, n_expert,
|
topk, out)
|
||||||
n_local_expert, topk, hidden_states)
|
|
||||||
return hidden_states
|
|
||||||
|
|
||||||
|
|
||||||
def moe_permute_unpermute_supported():
|
def moe_permute_unpermute_supported():
|
||||||
|
|||||||
@ -24,6 +24,7 @@ from vllm.model_executor.layers.mamba.mamba_utils import (
|
|||||||
extra_groups_for_head_shards, get_mamba_state_shape)
|
extra_groups_for_head_shards, get_mamba_state_shape)
|
||||||
from vllm.model_executor.layers.mamba.ops.causal_conv1d import (
|
from vllm.model_executor.layers.mamba.ops.causal_conv1d import (
|
||||||
causal_conv1d_fn, causal_conv1d_update)
|
causal_conv1d_fn, causal_conv1d_update)
|
||||||
|
from vllm.model_executor.layers.mamba.ops.layernorm_gated import rms_norm_gated
|
||||||
from vllm.model_executor.layers.mamba.ops.mamba_ssm import (
|
from vllm.model_executor.layers.mamba.ops.mamba_ssm import (
|
||||||
selective_state_update)
|
selective_state_update)
|
||||||
from vllm.model_executor.layers.mamba.ops.ssd_combined import (
|
from vllm.model_executor.layers.mamba.ops.ssd_combined import (
|
||||||
@ -133,21 +134,15 @@ class Mixer2RMSNormGated(CustomOp):
|
|||||||
return x * nn.functional.silu(gate.to(
|
return x * nn.functional.silu(gate.to(
|
||||||
torch.float32)).to(input_dtype)
|
torch.float32)).to(input_dtype)
|
||||||
|
|
||||||
if self.tp_size > 1 or self.n_groups != 1:
|
if (((self.n_groups % self.tp_size) != 0) or self.n_groups != 1):
|
||||||
return self.forward_native(x, gate)
|
return self.forward_native(x, gate)
|
||||||
|
|
||||||
from vllm import _custom_ops as ops
|
return rms_norm_gated(x,
|
||||||
|
self.weight.data,
|
||||||
# cast x and gate to float32 before silu
|
bias=None,
|
||||||
out = torch.empty_like(x)
|
z=gate,
|
||||||
y = x * nn.functional.silu(gate.to(torch.float32))
|
eps=self.variance_epsilon,
|
||||||
ops.rms_norm(
|
norm_before_gate=False)
|
||||||
out,
|
|
||||||
y.to(x.dtype),
|
|
||||||
self.weight.data,
|
|
||||||
self.variance_epsilon,
|
|
||||||
)
|
|
||||||
return out
|
|
||||||
|
|
||||||
|
|
||||||
def mamba_v2_sharded_weight_loader(
|
def mamba_v2_sharded_weight_loader(
|
||||||
|
|||||||
168
vllm/model_executor/layers/mamba/ops/layernorm_gated.py
Normal file
168
vllm/model_executor/layers/mamba/ops/layernorm_gated.py
Normal file
@ -0,0 +1,168 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
# Copyright (c) 2024, Tri Dao.
|
||||||
|
# Adapted from https://github.com/state-spaces/mamba/blob/60dadf2e0ee730ac337035d5533de10bc26e4847/mamba_ssm/ops/triton/layernorm_gated.py
|
||||||
|
|
||||||
|
import torch
|
||||||
|
|
||||||
|
from vllm.triton_utils import tl, triton
|
||||||
|
|
||||||
|
|
||||||
|
@triton.heuristics({"HAS_BIAS": lambda args: args["B"] is not None})
|
||||||
|
@triton.heuristics({"HAS_Z": lambda args: args["Z"] is not None})
|
||||||
|
@triton.jit
|
||||||
|
def _layer_norm_fwd_1pass_kernel(
|
||||||
|
X, # pointer to the input
|
||||||
|
Y, # pointer to the output
|
||||||
|
W, # pointer to the weights
|
||||||
|
B, # pointer to the biases
|
||||||
|
Z, # pointer to the other branch
|
||||||
|
Mean, # pointer to the mean
|
||||||
|
Rstd, # pointer to the 1/std
|
||||||
|
stride_x_row: tl.int64,
|
||||||
|
stride_y_row: tl.int64,
|
||||||
|
stride_z_row: tl.int64,
|
||||||
|
M: tl.int64, # number of rows in X
|
||||||
|
N: tl.int64, # number of columns in X
|
||||||
|
eps, # epsilon to avoid division by zero
|
||||||
|
BLOCK_N: tl.constexpr,
|
||||||
|
HAS_BIAS: tl.constexpr,
|
||||||
|
HAS_Z: tl.constexpr,
|
||||||
|
NORM_BEFORE_GATE: tl.constexpr,
|
||||||
|
IS_RMS_NORM: tl.constexpr,
|
||||||
|
):
|
||||||
|
# Map the program id to the row of X and Y it should compute.
|
||||||
|
row = tl.program_id(0)
|
||||||
|
group = tl.program_id(1)
|
||||||
|
X += row * stride_x_row + group * N
|
||||||
|
Y += row * stride_y_row + group * N
|
||||||
|
if HAS_Z:
|
||||||
|
Z += row * stride_z_row + group * N
|
||||||
|
if not IS_RMS_NORM:
|
||||||
|
Mean += group * M
|
||||||
|
Rstd += group * M
|
||||||
|
W += group * N
|
||||||
|
if HAS_BIAS:
|
||||||
|
B += group * N
|
||||||
|
# Compute mean and variance
|
||||||
|
cols = tl.arange(0, BLOCK_N)
|
||||||
|
x = tl.load(X + cols, mask=cols < N, other=0.).to(tl.float32)
|
||||||
|
if HAS_Z and not NORM_BEFORE_GATE:
|
||||||
|
z = tl.load(Z + cols, mask=cols < N).to(tl.float32)
|
||||||
|
x *= z * tl.sigmoid(z)
|
||||||
|
if not IS_RMS_NORM:
|
||||||
|
mean = tl.sum(x, axis=0) / N
|
||||||
|
tl.store(Mean + row, mean)
|
||||||
|
xbar = tl.where(cols < N, x - mean, 0.)
|
||||||
|
var = tl.sum(xbar * xbar, axis=0) / N
|
||||||
|
else:
|
||||||
|
xbar = tl.where(cols < N, x, 0.)
|
||||||
|
var = tl.sum(xbar * xbar, axis=0) / N
|
||||||
|
rstd = 1 / tl.sqrt(var + eps)
|
||||||
|
tl.store(Rstd + row, rstd)
|
||||||
|
# Normalize and apply linear transformation
|
||||||
|
mask = cols < N
|
||||||
|
w = tl.load(W + cols, mask=mask).to(tl.float32)
|
||||||
|
if HAS_BIAS:
|
||||||
|
b = tl.load(B + cols, mask=mask).to(tl.float32)
|
||||||
|
x_hat = (x - mean) * rstd if not IS_RMS_NORM else x * rstd
|
||||||
|
y = x_hat * w + b if HAS_BIAS else x_hat * w
|
||||||
|
if HAS_Z and NORM_BEFORE_GATE:
|
||||||
|
z = tl.load(Z + cols, mask=mask).to(tl.float32)
|
||||||
|
y *= z * tl.sigmoid(z)
|
||||||
|
# Write output
|
||||||
|
tl.store(Y + cols, y, mask=mask)
|
||||||
|
|
||||||
|
|
||||||
|
def _layer_norm_fwd(x,
|
||||||
|
weight,
|
||||||
|
bias,
|
||||||
|
eps,
|
||||||
|
z=None,
|
||||||
|
out=None,
|
||||||
|
group_size=None,
|
||||||
|
norm_before_gate=True,
|
||||||
|
is_rms_norm=False):
|
||||||
|
M, N = x.shape
|
||||||
|
if group_size is None:
|
||||||
|
group_size = N
|
||||||
|
assert N % group_size == 0
|
||||||
|
ngroups = N // group_size
|
||||||
|
assert x.stride(-1) == 1
|
||||||
|
if z is not None:
|
||||||
|
assert z.stride(-1) == 1
|
||||||
|
assert z.shape == (M, N)
|
||||||
|
assert weight.shape == (N, )
|
||||||
|
assert weight.stride(-1) == 1
|
||||||
|
if bias is not None:
|
||||||
|
assert bias.stride(-1) == 1
|
||||||
|
assert bias.shape == (N, )
|
||||||
|
# allocate output
|
||||||
|
if out is not None:
|
||||||
|
assert out.shape == x.shape
|
||||||
|
else:
|
||||||
|
out = torch.empty_like(x)
|
||||||
|
assert out.stride(-1) == 1
|
||||||
|
mean = torch.empty((ngroups * M, ), dtype=torch.float32,
|
||||||
|
device=x.device) if not is_rms_norm else None
|
||||||
|
rstd = torch.empty((ngroups * M, ), dtype=torch.float32, device=x.device)
|
||||||
|
# Less than 64KB per feature: enqueue fused kernel
|
||||||
|
MAX_FUSED_SIZE = 65536 // x.element_size()
|
||||||
|
BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(group_size))
|
||||||
|
if group_size > BLOCK_N:
|
||||||
|
raise RuntimeError(
|
||||||
|
"This layer norm doesn't support feature dim >= 64KB.")
|
||||||
|
# heuristics for number of warps
|
||||||
|
num_warps = min(max(BLOCK_N // 256, 1), 8)
|
||||||
|
grid = (M, ngroups)
|
||||||
|
with torch.cuda.device(x.device.index):
|
||||||
|
_layer_norm_fwd_1pass_kernel[grid](x,
|
||||||
|
out,
|
||||||
|
weight,
|
||||||
|
bias,
|
||||||
|
z,
|
||||||
|
mean,
|
||||||
|
rstd,
|
||||||
|
x.stride(0),
|
||||||
|
out.stride(0),
|
||||||
|
z.stride(0) if z is not None else 0,
|
||||||
|
M,
|
||||||
|
group_size,
|
||||||
|
eps,
|
||||||
|
BLOCK_N=BLOCK_N,
|
||||||
|
NORM_BEFORE_GATE=norm_before_gate,
|
||||||
|
IS_RMS_NORM=is_rms_norm,
|
||||||
|
num_warps=num_warps)
|
||||||
|
return out, mean, rstd
|
||||||
|
|
||||||
|
|
||||||
|
def rms_norm_gated(x,
|
||||||
|
weight,
|
||||||
|
bias,
|
||||||
|
z=None,
|
||||||
|
eps=1e-6,
|
||||||
|
group_size=None,
|
||||||
|
norm_before_gate=True):
|
||||||
|
x_shape_og = x.shape
|
||||||
|
# reshape input data into 2D tensor
|
||||||
|
x = x.reshape(-1, x.shape[-1])
|
||||||
|
if x.stride(-1) != 1:
|
||||||
|
x = x.contiguous()
|
||||||
|
if z is not None:
|
||||||
|
assert z.shape == x_shape_og
|
||||||
|
z = z.reshape(-1, z.shape[-1])
|
||||||
|
if z.stride(-1) != 1:
|
||||||
|
z = z.contiguous()
|
||||||
|
weight = weight.contiguous()
|
||||||
|
if bias is not None:
|
||||||
|
bias = bias.contiguous()
|
||||||
|
y, _, _ = _layer_norm_fwd(x,
|
||||||
|
weight,
|
||||||
|
bias,
|
||||||
|
eps,
|
||||||
|
z=z,
|
||||||
|
group_size=group_size,
|
||||||
|
norm_before_gate=norm_before_gate,
|
||||||
|
is_rms_norm=True)
|
||||||
|
|
||||||
|
return y.reshape(x_shape_og)
|
||||||
@ -16,8 +16,9 @@ from vllm.config import ModelConfig, PoolerConfig
|
|||||||
from vllm.model_executor.pooling_metadata import ( # noqa: E501
|
from vllm.model_executor.pooling_metadata import ( # noqa: E501
|
||||||
PoolingMetadata as V0PoolingMetadata)
|
PoolingMetadata as V0PoolingMetadata)
|
||||||
from vllm.model_executor.pooling_metadata import PoolingTensors
|
from vllm.model_executor.pooling_metadata import PoolingTensors
|
||||||
from vllm.pooling_params import PoolingParams, PoolingTask
|
from vllm.pooling_params import PoolingParams
|
||||||
from vllm.sequence import PoolerOutput, PoolingSequenceGroupOutput
|
from vllm.sequence import PoolerOutput, PoolingSequenceGroupOutput
|
||||||
|
from vllm.tasks import PoolingTask
|
||||||
from vllm.utils import resolve_obj_by_qualname
|
from vllm.utils import resolve_obj_by_qualname
|
||||||
from vllm.v1.pool.metadata import PoolingMetadata as V1PoolingMetadata
|
from vllm.v1.pool.metadata import PoolingMetadata as V1PoolingMetadata
|
||||||
|
|
||||||
|
|||||||
@ -33,7 +33,7 @@ from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
|
|||||||
find_matched_target, is_activation_quantization_format,
|
find_matched_target, is_activation_quantization_format,
|
||||||
should_ignore_layer)
|
should_ignore_layer)
|
||||||
from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
|
from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
|
||||||
from vllm.model_executor.layers.quantization.utils.nvfp4_emulation_utils import ( # noqa: E501
|
from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
||||||
cutlass_fp4_supported)
|
cutlass_fp4_supported)
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
|
|
||||||
|
|||||||
@ -27,8 +27,8 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import (
|
|||||||
prepare_moe_fp4_layer_for_marlin)
|
prepare_moe_fp4_layer_for_marlin)
|
||||||
from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
|
from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
|
||||||
prepare_moe_fp8_layer_for_marlin)
|
prepare_moe_fp8_layer_for_marlin)
|
||||||
from vllm.model_executor.layers.quantization.utils.nvfp4_emulation_utils import ( # noqa: E501
|
from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
||||||
cutlass_fp4_supported)
|
cutlass_fp4_supported, swizzle_blockscale)
|
||||||
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
|
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
|
||||||
all_close_1d, normalize_e4m3fn_to_e4m3fnuz, per_tensor_dequantize)
|
all_close_1d, normalize_e4m3fn_to_e4m3fnuz, per_tensor_dequantize)
|
||||||
from vllm.model_executor.utils import set_weight_attrs
|
from vllm.model_executor.utils import set_weight_attrs
|
||||||
@ -193,29 +193,6 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
|
|||||||
{"quant_method": FusedMoeWeightScaleSupported.TENSOR.value})
|
{"quant_method": FusedMoeWeightScaleSupported.TENSOR.value})
|
||||||
set_weight_attrs(w2_input_scale, extra_weight_attrs)
|
set_weight_attrs(w2_input_scale, extra_weight_attrs)
|
||||||
|
|
||||||
def swizzle_blockscale(self, scale: torch.tensor):
|
|
||||||
assert (scale.dtype == torch.float8_e4m3fn)
|
|
||||||
# Pad and blockwise interleave weight_scale
|
|
||||||
scale_ndim = scale.ndim
|
|
||||||
if scale.ndim == 2:
|
|
||||||
scale = scale.unsqueeze(0)
|
|
||||||
assert scale.ndim == 3
|
|
||||||
B, M, K = scale.shape
|
|
||||||
round_up_multiple = lambda x, m: (x + m - 1) // m * m
|
|
||||||
M_padded = round_up_multiple(M, 128)
|
|
||||||
K_padded = round_up_multiple(K, 4)
|
|
||||||
padded_scale = torch.zeros((B, M_padded, K_padded), dtype=scale.dtype)
|
|
||||||
padded_scale[:B, :M, :K] = scale
|
|
||||||
batches, rows, cols = padded_scale.shape
|
|
||||||
assert rows % 128 == 0
|
|
||||||
assert cols % 4 == 0
|
|
||||||
padded_scale = padded_scale.reshape(batches, rows // 128, 4, 32,
|
|
||||||
cols // 4, 4)
|
|
||||||
swizzled_scale = padded_scale.permute((0, 1, 4, 3, 2, 5))
|
|
||||||
swizzled_scale = swizzled_scale.contiguous().cuda()
|
|
||||||
return (swizzled_scale.reshape(M, K)
|
|
||||||
if scale_ndim == 2 else swizzled_scale.reshape(B, M, K))
|
|
||||||
|
|
||||||
def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
|
def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
|
||||||
|
|
||||||
# From packed to weight
|
# From packed to weight
|
||||||
@ -243,13 +220,13 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
|
|||||||
return
|
return
|
||||||
|
|
||||||
# swizzle weight scales
|
# swizzle weight scales
|
||||||
layer.w13_blockscale_swizzled = torch.nn.Parameter(
|
layer.w13_blockscale_swizzled = torch.nn.Parameter(swizzle_blockscale(
|
||||||
self.swizzle_blockscale(layer.w13_weight_scale),
|
layer.w13_weight_scale),
|
||||||
requires_grad=False)
|
requires_grad=False)
|
||||||
|
|
||||||
layer.w2_blockscale_swizzled = torch.nn.Parameter(
|
layer.w2_blockscale_swizzled = torch.nn.Parameter(swizzle_blockscale(
|
||||||
self.swizzle_blockscale(layer.w2_weight_scale),
|
layer.w2_weight_scale),
|
||||||
requires_grad=False)
|
requires_grad=False)
|
||||||
|
|
||||||
# w13
|
# w13
|
||||||
w13_input_global_scale = layer.w13_input_global_scale.max(
|
w13_input_global_scale = layer.w13_input_global_scale.max(
|
||||||
|
|||||||
@ -9,8 +9,7 @@ from torch.nn.parameter import Parameter
|
|||||||
|
|
||||||
import vllm.envs as envs
|
import vllm.envs as envs
|
||||||
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
|
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
|
||||||
from vllm._custom_ops import (cutlass_scaled_fp4_mm,
|
from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
|
||||||
cutlass_scaled_mm_supports_fp4, scaled_fp4_quant)
|
|
||||||
from vllm.distributed import get_ep_group
|
from vllm.distributed import get_ep_group
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.model_executor.layers.fused_moe.config import FusedMoEParallelConfig
|
from vllm.model_executor.layers.fused_moe.config import FusedMoEParallelConfig
|
||||||
@ -28,7 +27,7 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import (
|
|||||||
apply_fp4_marlin_linear, is_fp4_marlin_supported,
|
apply_fp4_marlin_linear, is_fp4_marlin_supported,
|
||||||
prepare_fp4_layer_for_marlin, prepare_moe_fp4_layer_for_marlin)
|
prepare_fp4_layer_for_marlin, prepare_moe_fp4_layer_for_marlin)
|
||||||
from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
||||||
GroupShape, is_layer_skipped)
|
GroupShape, cutlass_fp4_supported, is_layer_skipped, swizzle_blockscale)
|
||||||
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
|
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
|
||||||
Fp8LinearOp, requantize_with_max_scale)
|
Fp8LinearOp, requantize_with_max_scale)
|
||||||
from vllm.model_executor.parameter import (ModelWeightParameter,
|
from vllm.model_executor.parameter import (ModelWeightParameter,
|
||||||
@ -667,14 +666,6 @@ class ModelOptNvFp4Config(QuantizationConfig):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def cutlass_fp4_supported() -> bool:
|
|
||||||
if not current_platform.is_cuda():
|
|
||||||
return False
|
|
||||||
capability_tuple = current_platform.get_device_capability()
|
|
||||||
capability = -1 if capability_tuple is None else capability_tuple.to_int()
|
|
||||||
return cutlass_scaled_mm_supports_fp4(capability)
|
|
||||||
|
|
||||||
|
|
||||||
class ModelOptFp8KVCacheMethod(BaseKVCacheMethod):
|
class ModelOptFp8KVCacheMethod(BaseKVCacheMethod):
|
||||||
"""
|
"""
|
||||||
Supports loading kv-cache scaling factors from FP8 checkpoints.
|
Supports loading kv-cache scaling factors from FP8 checkpoints.
|
||||||
@ -772,29 +763,6 @@ class ModelOptNvFp4LinearMethod(LinearMethodBase):
|
|||||||
|
|
||||||
layer.register_parameter("weight_scale", weight_scale)
|
layer.register_parameter("weight_scale", weight_scale)
|
||||||
|
|
||||||
def swizzle_blockscale(self, scale: torch.tensor):
|
|
||||||
assert (scale.dtype == torch.float8_e4m3fn)
|
|
||||||
# Pad and blockwise interleave weight_scale
|
|
||||||
scale_ndim = scale.ndim
|
|
||||||
if scale.ndim == 2:
|
|
||||||
scale = scale.unsqueeze(0)
|
|
||||||
assert scale.ndim == 3
|
|
||||||
B, M, K = scale.shape
|
|
||||||
round_up_multiple = lambda x, m: (x + m - 1) // m * m
|
|
||||||
M_padded = round_up_multiple(M, 128)
|
|
||||||
K_padded = round_up_multiple(K, 4)
|
|
||||||
padded_scale = torch.zeros((B, M_padded, K_padded), dtype=scale.dtype)
|
|
||||||
padded_scale[:B, :M, :K] = scale
|
|
||||||
batches, rows, cols = padded_scale.shape
|
|
||||||
assert rows % 128 == 0
|
|
||||||
assert cols % 4 == 0
|
|
||||||
padded_scale = padded_scale.reshape(batches, rows // 128, 4, 32,
|
|
||||||
cols // 4, 4)
|
|
||||||
swizzled_scale = padded_scale.permute((0, 1, 4, 3, 2, 5))
|
|
||||||
swizzled_scale = swizzled_scale.contiguous().cuda()
|
|
||||||
return (swizzled_scale.reshape(M, K)
|
|
||||||
if scale_ndim == 2 else swizzled_scale.reshape(B, M, K))
|
|
||||||
|
|
||||||
def process_weights_after_loading(self, layer: Module) -> None:
|
def process_weights_after_loading(self, layer: Module) -> None:
|
||||||
|
|
||||||
# global scales:
|
# global scales:
|
||||||
@ -814,7 +782,7 @@ class ModelOptNvFp4LinearMethod(LinearMethodBase):
|
|||||||
"Expected weight_scale.dim(1) to be divisible by 16")
|
"Expected weight_scale.dim(1) to be divisible by 16")
|
||||||
assert (layer.weight_scale.dtype == torch.float8_e4m3fn), (
|
assert (layer.weight_scale.dtype == torch.float8_e4m3fn), (
|
||||||
"Weight Block scale must be represented as FP8-E4M3")
|
"Weight Block scale must be represented as FP8-E4M3")
|
||||||
swizzled_weight_scale = self.swizzle_blockscale(layer.weight_scale)
|
swizzled_weight_scale = swizzle_blockscale(layer.weight_scale)
|
||||||
|
|
||||||
layer.weight_scale_swizzled = Parameter(swizzled_weight_scale,
|
layer.weight_scale_swizzled = Parameter(swizzled_weight_scale,
|
||||||
requires_grad=False)
|
requires_grad=False)
|
||||||
@ -1060,29 +1028,6 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
|
|||||||
weight_loader=weight_loader)
|
weight_loader=weight_loader)
|
||||||
layer.register_parameter("w2_input_scale", w2_input_scale)
|
layer.register_parameter("w2_input_scale", w2_input_scale)
|
||||||
|
|
||||||
def swizzle_blockscale(self, scale: torch.tensor):
|
|
||||||
assert (scale.dtype == torch.float8_e4m3fn)
|
|
||||||
# Pad and blockwise interleave weight_scale
|
|
||||||
scale_ndim = scale.ndim
|
|
||||||
if scale.ndim == 2:
|
|
||||||
scale = scale.unsqueeze(0)
|
|
||||||
assert scale.ndim == 3
|
|
||||||
B, M, K = scale.shape
|
|
||||||
round_up_multiple = lambda x, m: (x + m - 1) // m * m
|
|
||||||
M_padded = round_up_multiple(M, 128)
|
|
||||||
K_padded = round_up_multiple(K, 4)
|
|
||||||
padded_scale = torch.zeros((B, M_padded, K_padded), dtype=scale.dtype)
|
|
||||||
padded_scale[:B, :M, :K] = scale
|
|
||||||
batches, rows, cols = padded_scale.shape
|
|
||||||
assert rows % 128 == 0
|
|
||||||
assert cols % 4 == 0
|
|
||||||
padded_scale = padded_scale.reshape(batches, rows // 128, 4, 32,
|
|
||||||
cols // 4, 4)
|
|
||||||
swizzled_scale = padded_scale.permute((0, 1, 4, 3, 2, 5))
|
|
||||||
swizzled_scale = swizzled_scale.contiguous().cuda()
|
|
||||||
return (swizzled_scale.reshape(M, K)
|
|
||||||
if scale_ndim == 2 else swizzled_scale.reshape(B, M, K))
|
|
||||||
|
|
||||||
def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
|
def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
|
||||||
# GEMM 1
|
# GEMM 1
|
||||||
# The FlashInfer Cutlass fused MoE kernel expects the combined weights
|
# The FlashInfer Cutlass fused MoE kernel expects the combined weights
|
||||||
@ -1128,8 +1073,7 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
|
|||||||
"Expected weight_scale.dim(1) to be divisible by 16")
|
"Expected weight_scale.dim(1) to be divisible by 16")
|
||||||
assert (layer.w13_weight_scale.dtype == torch.float8_e4m3fn), (
|
assert (layer.w13_weight_scale.dtype == torch.float8_e4m3fn), (
|
||||||
"Weight Blockscale must be represented as FP8-E4M3")
|
"Weight Blockscale must be represented as FP8-E4M3")
|
||||||
w13_blockscale_swizzled = self.swizzle_blockscale(
|
w13_blockscale_swizzled = swizzle_blockscale(layer.w13_weight_scale)
|
||||||
layer.w13_weight_scale)
|
|
||||||
|
|
||||||
layer.w13_blockscale_swizzled = Parameter(w13_blockscale_swizzled,
|
layer.w13_blockscale_swizzled = Parameter(w13_blockscale_swizzled,
|
||||||
requires_grad=False)
|
requires_grad=False)
|
||||||
@ -1151,7 +1095,7 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
|
|||||||
"Expected weight_scale.dim(1) to be divisible by 16")
|
"Expected weight_scale.dim(1) to be divisible by 16")
|
||||||
assert (layer.w2_weight_scale.dtype == torch.float8_e4m3fn), (
|
assert (layer.w2_weight_scale.dtype == torch.float8_e4m3fn), (
|
||||||
"Weight Blockscale must be represented as FP8-E4M3")
|
"Weight Blockscale must be represented as FP8-E4M3")
|
||||||
w2_blockscale_swizzled = self.swizzle_blockscale(layer.w2_weight_scale)
|
w2_blockscale_swizzled = swizzle_blockscale(layer.w2_weight_scale)
|
||||||
|
|
||||||
layer.w2_blockscale_swizzled = Parameter(w2_blockscale_swizzled,
|
layer.w2_blockscale_swizzled = Parameter(w2_blockscale_swizzled,
|
||||||
requires_grad=False)
|
requires_grad=False)
|
||||||
|
|||||||
@ -3,18 +3,19 @@
|
|||||||
# Copyright © 2025, Oracle and/or its affiliates.
|
# Copyright © 2025, Oracle and/or its affiliates.
|
||||||
|
|
||||||
import os
|
import os
|
||||||
from typing import Any, Optional
|
from typing import Any, Callable, Optional
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import torch.nn.functional as F
|
import torch.nn.functional as F
|
||||||
from torch.nn.parameter import Parameter
|
from torch.nn.parameter import Parameter
|
||||||
|
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
|
from vllm.model_executor.layers.fused_moe import FusedMoE, FusedMoEMethodBase
|
||||||
from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
|
from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
|
||||||
set_weight_attrs)
|
set_weight_attrs)
|
||||||
from vllm.model_executor.layers.quantization import QuantizationMethods
|
from vllm.model_executor.layers.quantization import QuantizationMethods
|
||||||
from vllm.model_executor.layers.quantization.base_config import (
|
from vllm.model_executor.layers.quantization.base_config import (
|
||||||
QuantizationConfig)
|
QuantizationConfig, QuantizeMethodBase)
|
||||||
|
|
||||||
logger = init_logger(__name__)
|
logger = init_logger(__name__)
|
||||||
"""By default, use 8 bit as target precision, but it can be
|
"""By default, use 8 bit as target precision, but it can be
|
||||||
@ -71,9 +72,11 @@ class RTNConfig(QuantizationConfig):
|
|||||||
return cls(weight_bits, group_size)
|
return cls(weight_bits, group_size)
|
||||||
|
|
||||||
def get_quant_method(self, layer: torch.nn.Module,
|
def get_quant_method(self, layer: torch.nn.Module,
|
||||||
prefix: str) -> Optional["RTNLinearMethod"]:
|
prefix: str) -> Optional["QuantizeMethodBase"]:
|
||||||
if isinstance(layer, LinearBase):
|
if isinstance(layer, LinearBase):
|
||||||
return RTNLinearMethod(self)
|
return RTNLinearMethod(self)
|
||||||
|
elif isinstance(layer, FusedMoE):
|
||||||
|
return RTNMoEMethod(self)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
@ -94,11 +97,18 @@ class RTNTensor:
|
|||||||
self.data.narrow(dim, start // factor, length // factor),
|
self.data.narrow(dim, start // factor, length // factor),
|
||||||
self.scale.narrow(dim, start, length), self.quant_config)
|
self.scale.narrow(dim, start, length), self.quant_config)
|
||||||
|
|
||||||
|
def __getitem__(self, key):
|
||||||
|
return RTNTensor(self.data[key], self.scale[key], self.quant_config)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def shape(self):
|
def shape(self):
|
||||||
shape = self.data.shape
|
shape = self.data.shape
|
||||||
factor = 1 if self.quant_config.weight_bits == 8 else 2
|
factor = 1 if self.quant_config.weight_bits == 8 else 2
|
||||||
return torch.Size((shape[0] * factor, shape[1]))
|
batch_present = len(shape) == 3
|
||||||
|
if batch_present:
|
||||||
|
return torch.Size((shape[0], shape[1] * factor, shape[2]))
|
||||||
|
else:
|
||||||
|
return torch.Size((shape[0] * factor, shape[1]))
|
||||||
|
|
||||||
def copy_(self, loaded_weight: torch.Tensor) -> None:
|
def copy_(self, loaded_weight: torch.Tensor) -> None:
|
||||||
qweight, weight_scale = rtn_quantize(loaded_weight.cuda(),
|
qweight, weight_scale = rtn_quantize(loaded_weight.cuda(),
|
||||||
@ -165,7 +175,7 @@ class RTNLinearMethod(LinearMethodBase):
|
|||||||
weight = RTNParameter(data=torch.empty(output_size_per_partition //
|
weight = RTNParameter(data=torch.empty(output_size_per_partition //
|
||||||
factor,
|
factor,
|
||||||
input_size_per_partition,
|
input_size_per_partition,
|
||||||
dtype=torch.int8),
|
dtype=torch.uint8),
|
||||||
scale=scale,
|
scale=scale,
|
||||||
quant_config=self.quant_config)
|
quant_config=self.quant_config)
|
||||||
|
|
||||||
@ -180,18 +190,7 @@ class RTNLinearMethod(LinearMethodBase):
|
|||||||
layer.output_size_per_partition = output_size_per_partition
|
layer.output_size_per_partition = output_size_per_partition
|
||||||
|
|
||||||
def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
|
def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
|
||||||
"""torch.compile does not know how to deal with a Parameter subclass
|
fix_weights(layer, "weight")
|
||||||
(aka RTNParameter). As we don't really need RTNParameters for the
|
|
||||||
forward pass, we replace them with equivalent instances of Parameters.
|
|
||||||
"""
|
|
||||||
old_weight = layer.weight
|
|
||||||
assert isinstance(old_weight, RTNParameter)
|
|
||||||
data = old_weight.data.data
|
|
||||||
|
|
||||||
delattr(layer, "weight")
|
|
||||||
|
|
||||||
new_weight = Parameter(data=data, requires_grad=False)
|
|
||||||
layer.register_parameter("weight", new_weight)
|
|
||||||
|
|
||||||
def apply(self,
|
def apply(self,
|
||||||
layer: torch.nn.Module,
|
layer: torch.nn.Module,
|
||||||
@ -209,6 +208,128 @@ class RTNLinearMethod(LinearMethodBase):
|
|||||||
return out
|
return out
|
||||||
|
|
||||||
|
|
||||||
|
class RTNMoEMethod(FusedMoEMethodBase):
|
||||||
|
|
||||||
|
def __init__(self, quant_config: RTNConfig):
|
||||||
|
self.quant_config = quant_config
|
||||||
|
|
||||||
|
def create_weights(self, layer: torch.nn.Module, num_experts: int,
|
||||||
|
hidden_size: int, intermediate_size_per_partition: int,
|
||||||
|
params_dtype: torch.dtype, **extra_weight_attrs):
|
||||||
|
|
||||||
|
factor = 1 if self.quant_config.weight_bits == 8 else 2
|
||||||
|
|
||||||
|
# Fused gate_up_proj (column parallel)
|
||||||
|
num_groups_per_col = (hidden_size // self.quant_config.group_size
|
||||||
|
if self.quant_config.group_size != -1 else 1)
|
||||||
|
w13_scale = Parameter(
|
||||||
|
torch.empty(num_experts,
|
||||||
|
2 * intermediate_size_per_partition,
|
||||||
|
num_groups_per_col,
|
||||||
|
dtype=params_dtype),
|
||||||
|
requires_grad=False,
|
||||||
|
)
|
||||||
|
layer.register_parameter("w13_scale", w13_scale)
|
||||||
|
|
||||||
|
w13_weight = RTNParameter(data=torch.empty(
|
||||||
|
num_experts,
|
||||||
|
2 * intermediate_size_per_partition // factor,
|
||||||
|
hidden_size,
|
||||||
|
dtype=torch.uint8),
|
||||||
|
scale=w13_scale,
|
||||||
|
quant_config=self.quant_config)
|
||||||
|
layer.register_parameter("w13_weight", w13_weight)
|
||||||
|
set_weight_attrs(w13_weight, extra_weight_attrs)
|
||||||
|
|
||||||
|
# down_proj (row parallel)
|
||||||
|
num_groups_per_col = (intermediate_size_per_partition //
|
||||||
|
self.quant_config.group_size
|
||||||
|
if self.quant_config.group_size != -1 else 1)
|
||||||
|
w2_scale = Parameter(torch.zeros(num_experts,
|
||||||
|
hidden_size,
|
||||||
|
num_groups_per_col,
|
||||||
|
dtype=params_dtype),
|
||||||
|
requires_grad=False)
|
||||||
|
layer.register_parameter("w2_scale", w2_scale)
|
||||||
|
|
||||||
|
w2_weight = RTNParameter(data=torch.empty(
|
||||||
|
num_experts,
|
||||||
|
hidden_size // factor,
|
||||||
|
intermediate_size_per_partition,
|
||||||
|
dtype=torch.uint8),
|
||||||
|
scale=w2_scale,
|
||||||
|
quant_config=self.quant_config)
|
||||||
|
layer.register_parameter("w2_weight", w2_weight)
|
||||||
|
set_weight_attrs(w2_weight, extra_weight_attrs)
|
||||||
|
|
||||||
|
def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
|
||||||
|
weight_bits = self.quant_config.weight_bits
|
||||||
|
fix_weights(layer, "w13_weight", weight_bits == 4)
|
||||||
|
fix_weights(layer, "w2_weight", weight_bits == 4)
|
||||||
|
|
||||||
|
def apply(
|
||||||
|
self,
|
||||||
|
layer: torch.nn.Module,
|
||||||
|
x: torch.Tensor,
|
||||||
|
router_logits: torch.Tensor,
|
||||||
|
top_k: int,
|
||||||
|
renormalize: bool,
|
||||||
|
use_grouped_topk: bool = False,
|
||||||
|
topk_group: Optional[int] = None,
|
||||||
|
num_expert_group: Optional[int] = None,
|
||||||
|
global_num_experts: int = -1,
|
||||||
|
expert_map: Optional[torch.Tensor] = None,
|
||||||
|
custom_routing_function: Optional[Callable] = None,
|
||||||
|
scoring_func: str = "softmax",
|
||||||
|
e_score_correction_bias: Optional[torch.Tensor] = None,
|
||||||
|
apply_router_weight_on_input: bool = False,
|
||||||
|
activation: str = "silu",
|
||||||
|
enable_eplb: bool = False,
|
||||||
|
expert_load_view: Optional[torch.Tensor] = None,
|
||||||
|
logical_to_physical_map: Optional[torch.Tensor] = None,
|
||||||
|
logical_replica_count: Optional[torch.Tensor] = None,
|
||||||
|
) -> torch.Tensor:
|
||||||
|
if enable_eplb:
|
||||||
|
raise NotImplementedError(
|
||||||
|
"EPLB not supported for `RTNMoEMethod` yet.")
|
||||||
|
|
||||||
|
from vllm.model_executor.layers.fused_moe import fused_experts
|
||||||
|
|
||||||
|
topk_weights, topk_ids = FusedMoE.select_experts(
|
||||||
|
hidden_states=x,
|
||||||
|
router_logits=router_logits,
|
||||||
|
use_grouped_topk=use_grouped_topk,
|
||||||
|
top_k=top_k,
|
||||||
|
renormalize=renormalize,
|
||||||
|
topk_group=topk_group,
|
||||||
|
num_expert_group=num_expert_group,
|
||||||
|
custom_routing_function=custom_routing_function,
|
||||||
|
scoring_func=scoring_func,
|
||||||
|
e_score_correction_bias=e_score_correction_bias)
|
||||||
|
|
||||||
|
weight_bits = self.quant_config.weight_bits
|
||||||
|
group_size = self.quant_config.group_size
|
||||||
|
|
||||||
|
ret = fused_experts(
|
||||||
|
x,
|
||||||
|
layer.w13_weight,
|
||||||
|
layer.w2_weight,
|
||||||
|
topk_weights=topk_weights,
|
||||||
|
topk_ids=topk_ids,
|
||||||
|
inplace=True,
|
||||||
|
activation=activation,
|
||||||
|
use_int4_w4a16=weight_bits == 4,
|
||||||
|
use_int8_w8a16=weight_bits == 8,
|
||||||
|
global_num_experts=global_num_experts,
|
||||||
|
w1_scale=layer.w13_scale,
|
||||||
|
w2_scale=layer.w2_scale,
|
||||||
|
apply_router_weight_on_input=apply_router_weight_on_input,
|
||||||
|
expert_map=expert_map,
|
||||||
|
block_shape=[0, group_size])
|
||||||
|
|
||||||
|
return ret
|
||||||
|
|
||||||
|
|
||||||
def rtn_quantize(tensor: torch.Tensor, num_bits: int,
|
def rtn_quantize(tensor: torch.Tensor, num_bits: int,
|
||||||
group_size: int) -> tuple[torch.Tensor, torch.Tensor]:
|
group_size: int) -> tuple[torch.Tensor, torch.Tensor]:
|
||||||
"""Quantize a tensor using per-group static scaling factor.
|
"""Quantize a tensor using per-group static scaling factor.
|
||||||
@ -221,34 +342,44 @@ def rtn_quantize(tensor: torch.Tensor, num_bits: int,
|
|||||||
If equal to -1, each row in the input tensor is treated
|
If equal to -1, each row in the input tensor is treated
|
||||||
as one group.
|
as one group.
|
||||||
"""
|
"""
|
||||||
|
batch_present = len(tensor.shape) == 3
|
||||||
|
if not batch_present:
|
||||||
|
tensor = tensor.unsqueeze(0)
|
||||||
|
|
||||||
q_range = 2**num_bits
|
q_range = 2**num_bits
|
||||||
num_groups = (tensor.shape[0] * tensor.shape[1] //
|
num_groups = (tensor.shape[1] * tensor.shape[2] //
|
||||||
group_size if group_size != -1 else tensor.shape[0])
|
group_size if group_size != -1 else tensor.shape[1])
|
||||||
"""Calculate a scaling factor per input group.
|
"""Calculate a scaling factor per input group.
|
||||||
"""
|
"""
|
||||||
input_flat = tensor.reshape(num_groups, -1)
|
input_flat = tensor.reshape(tensor.shape[0], num_groups, -1)
|
||||||
input_min = torch.min(input_flat, dim=1, keepdim=True)[0]
|
input_min = torch.min(input_flat, dim=2, keepdim=True)[0]
|
||||||
input_max = torch.max(input_flat, dim=1, keepdim=True)[0]
|
input_max = torch.max(input_flat, dim=2, keepdim=True)[0]
|
||||||
input_max_abs = torch.max(input_min.abs(), input_max.abs())
|
input_max_abs = torch.max(input_min.abs(), input_max.abs())
|
||||||
scale = (input_max_abs * 2.0 / (q_range - 1))
|
scale = (input_max_abs * 2.0 / (q_range - 1))
|
||||||
"""Scale each input group, truncate and round to the nearest integer.
|
"""Scale each input group, round to the nearest integer, shift
|
||||||
|
the range and truncate.
|
||||||
"""
|
"""
|
||||||
scaled_input = input_flat / scale
|
scaled_input = input_flat / scale
|
||||||
scaled_input = scaled_input.clamp(-q_range // 2, q_range // 2 - 1)
|
|
||||||
scaled_input = scaled_input.round()
|
scaled_input = scaled_input.round()
|
||||||
|
scaled_input += q_range // 2
|
||||||
|
scaled_input = scaled_input.clamp(0, q_range - 1)
|
||||||
|
|
||||||
scale = scale.reshape(tensor.shape[0], -1).contiguous()
|
scale = scale.reshape(tensor.shape[0], tensor.shape[1], -1).contiguous()
|
||||||
inputs_q = scaled_input.reshape(tensor.shape).to(torch.int8)
|
inputs_q = scaled_input.reshape(tensor.shape).to(torch.uint8)
|
||||||
inputs_q = inputs_q.contiguous()
|
inputs_q = inputs_q.contiguous()
|
||||||
|
|
||||||
if num_bits == 4:
|
if num_bits == 4:
|
||||||
"""Pack two 4-bit values into each byte.
|
"""Pack two 4-bit values into each byte.
|
||||||
"""
|
"""
|
||||||
inputs_q = (inputs_q[:, 1::2] << 4) | (inputs_q[:, ::2] & 0xf)
|
inputs_q = (inputs_q[:, :, 1::2] << 4) | (inputs_q[:, :, ::2] & 0xf)
|
||||||
inputs_q = inputs_q.reshape(tensor.shape[0] // 2, tensor.shape[1])
|
inputs_q = inputs_q.reshape(tensor.shape[0], tensor.shape[1] // 2,
|
||||||
|
tensor.shape[2])
|
||||||
inputs_q = inputs_q.contiguous()
|
inputs_q = inputs_q.contiguous()
|
||||||
|
|
||||||
|
if not batch_present:
|
||||||
|
inputs_q = inputs_q.squeeze(0)
|
||||||
|
scale = scale.squeeze(0)
|
||||||
|
|
||||||
return inputs_q, scale
|
return inputs_q, scale
|
||||||
|
|
||||||
|
|
||||||
@ -259,31 +390,60 @@ def rtn_dequantize(tensor: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
|
|||||||
tensor: The input tensor.
|
tensor: The input tensor.
|
||||||
scale: The tensor with per-group scale factors.
|
scale: The tensor with per-group scale factors.
|
||||||
"""
|
"""
|
||||||
|
batch_present = len(tensor.shape) == 3
|
||||||
|
if not batch_present:
|
||||||
|
tensor = tensor.unsqueeze(0)
|
||||||
|
scale = scale.unsqueeze(0)
|
||||||
|
|
||||||
num_groups = scale.size(0) * scale.size(1)
|
num_groups = scale.size(1) * scale.size(2)
|
||||||
input_dim, output_dim = tensor.shape
|
batch, input_dim, output_dim = tensor.shape
|
||||||
|
|
||||||
num_bits = 8 if input_dim == scale.size(0) else 4
|
num_bits = 8 if input_dim == scale.size(1) else 4
|
||||||
|
q_range = 2**num_bits
|
||||||
if num_bits == 4:
|
if num_bits == 4:
|
||||||
input_dim *= 2
|
input_dim *= 2
|
||||||
|
|
||||||
data = torch.empty((input_dim, output_dim),
|
data = torch.empty((batch, input_dim, output_dim),
|
||||||
dtype=scale.dtype,
|
dtype=scale.dtype,
|
||||||
device=tensor.device)
|
device=tensor.device)
|
||||||
|
|
||||||
if num_bits == 8:
|
if num_bits == 8:
|
||||||
data.copy_(tensor)
|
data.copy_(tensor)
|
||||||
|
data -= q_range // 2
|
||||||
else:
|
else:
|
||||||
"""Unpack two 4-bit values from each byte.
|
"""Unpack two 4-bit values from each byte.
|
||||||
"""
|
"""
|
||||||
tensor = tensor.reshape(input_dim, output_dim // 2)
|
tensor = tensor.reshape(batch, input_dim, output_dim // 2)
|
||||||
for i in range(2):
|
for i in range(2):
|
||||||
data[:, i::2] = (tensor << 4 * (1 - i)) >> 4
|
data[:, :, i::2] = ((tensor << 4 *
|
||||||
|
(1 - i)) >> 4).to(torch.int8) - q_range // 2
|
||||||
"""Scale each input group with its scaling factor.
|
"""Scale each input group with its scaling factor.
|
||||||
"""
|
"""
|
||||||
scale = scale.reshape(num_groups, -1)
|
scale = scale.reshape(batch, num_groups, -1)
|
||||||
data = data.reshape(num_groups, -1)
|
data = data.reshape(batch, num_groups, -1)
|
||||||
data = torch.mul(data, scale)
|
data = torch.mul(data, scale)
|
||||||
|
|
||||||
input_deq = data.reshape((input_dim, output_dim)).contiguous()
|
input_deq = data.reshape((batch, input_dim, output_dim)).contiguous()
|
||||||
|
if not batch_present:
|
||||||
|
input_deq = input_deq.squeeze(0)
|
||||||
|
|
||||||
return input_deq
|
return input_deq
|
||||||
|
|
||||||
|
|
||||||
|
def fix_weights(layer: torch.nn.Module,
|
||||||
|
param_name: str,
|
||||||
|
reshape: bool = False):
|
||||||
|
"""torch.compile does not know how to deal with a Parameter subclass
|
||||||
|
(aka RTNParameter). As we don't really need RTNParameters for the
|
||||||
|
forward pass, we replace them with equivalent instances of Parameters.
|
||||||
|
"""
|
||||||
|
old_weight = getattr(layer, param_name)
|
||||||
|
assert isinstance(old_weight, RTNParameter)
|
||||||
|
data = old_weight.data.data
|
||||||
|
|
||||||
|
delattr(layer, param_name)
|
||||||
|
|
||||||
|
if reshape:
|
||||||
|
data = data.reshape(old_weight.shape[0], old_weight.shape[1] * 2, -1)
|
||||||
|
new_weight = Parameter(data=data, requires_grad=False)
|
||||||
|
layer.register_parameter(param_name, new_weight)
|
||||||
|
|||||||
@ -238,13 +238,20 @@ def per_token_group_quant_int8(
|
|||||||
int8_min = iinfo.min
|
int8_min = iinfo.min
|
||||||
|
|
||||||
x_q = torch.empty_like(x, device=x.device, dtype=dtype)
|
x_q = torch.empty_like(x, device=x.device, dtype=dtype)
|
||||||
M = x.numel() // group_size
|
|
||||||
N = group_size
|
|
||||||
x_s = torch.empty(
|
x_s = torch.empty(
|
||||||
x.shape[:-1] + (x.shape[-1] // group_size, ),
|
x.shape[:-1] + (x.shape[-1] // group_size, ),
|
||||||
device=x.device,
|
device=x.device,
|
||||||
dtype=torch.float32,
|
dtype=torch.float32,
|
||||||
)
|
)
|
||||||
|
# prefer CUDA kernel if available
|
||||||
|
if current_platform.is_cuda():
|
||||||
|
torch.ops._C.per_token_group_quant_int8(x, x_q, x_s, group_size, eps,
|
||||||
|
float(int8_min),
|
||||||
|
float(int8_max))
|
||||||
|
return x_q, x_s
|
||||||
|
|
||||||
|
M = x.numel() // group_size
|
||||||
|
N = group_size
|
||||||
|
|
||||||
BLOCK = triton.next_power_of_2(N)
|
BLOCK = triton.next_power_of_2(N)
|
||||||
# heuristics for number of warps
|
# heuristics for number of warps
|
||||||
|
|||||||
@ -2,13 +2,12 @@
|
|||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from vllm._custom_ops import cutlass_scaled_mm_supports_fp4
|
|
||||||
from vllm.platforms import current_platform
|
|
||||||
from vllm.scalar_type import scalar_types
|
from vllm.scalar_type import scalar_types
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"break_fp4_bytes", "dequantize_to_dtype", "ref_nvfp4_quant",
|
"break_fp4_bytes",
|
||||||
"cutlass_fp4_supported"
|
"dequantize_to_dtype",
|
||||||
|
"ref_nvfp4_quant",
|
||||||
]
|
]
|
||||||
|
|
||||||
FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max()
|
FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max()
|
||||||
@ -17,14 +16,6 @@ kE2M1ToFloat = torch.tensor([0., 0.5, 1., 1.5, 2., 3., 4., 6.],
|
|||||||
dtype=torch.float32)
|
dtype=torch.float32)
|
||||||
|
|
||||||
|
|
||||||
def cutlass_fp4_supported() -> bool:
|
|
||||||
if not current_platform.is_cuda():
|
|
||||||
return False
|
|
||||||
capability_tuple = current_platform.get_device_capability()
|
|
||||||
capability = -1 if capability_tuple is None else capability_tuple.to_int()
|
|
||||||
return cutlass_scaled_mm_supports_fp4(capability)
|
|
||||||
|
|
||||||
|
|
||||||
def break_fp4_bytes(a, dtype):
|
def break_fp4_bytes(a, dtype):
|
||||||
assert a.dtype == torch.uint8
|
assert a.dtype == torch.uint8
|
||||||
m, n = a.shape
|
m, n = a.shape
|
||||||
|
|||||||
@ -8,8 +8,10 @@ from typing import ClassVar, NamedTuple, Optional
|
|||||||
import numpy
|
import numpy
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
|
from vllm._custom_ops import cutlass_scaled_mm_supports_fp4
|
||||||
from vllm.model_executor.layers.quantization.qqq import (
|
from vllm.model_executor.layers.quantization.qqq import (
|
||||||
MARLIN_QQQ_SUPPORTED_NUM_BITS)
|
MARLIN_QQQ_SUPPORTED_NUM_BITS)
|
||||||
|
from vllm.platforms import current_platform
|
||||||
from vllm.scalar_type import ScalarType, scalar_types
|
from vllm.scalar_type import ScalarType, scalar_types
|
||||||
|
|
||||||
|
|
||||||
@ -592,3 +594,56 @@ def awq_pack(
|
|||||||
q_w = q_w.reshape((-1, size_n)).contiguous()
|
q_w = q_w.reshape((-1, size_n)).contiguous()
|
||||||
|
|
||||||
return pack_cols(q_w, num_bits, size_k, size_n)
|
return pack_cols(q_w, num_bits, size_k, size_n)
|
||||||
|
|
||||||
|
|
||||||
|
def swizzle_blockscale(scale: torch.Tensor) -> torch.Tensor:
|
||||||
|
"""
|
||||||
|
Pad and block-interleave the FP4 block-scales so that they match the data
|
||||||
|
layout expected by the CUTLASS / FlashInfer kernels.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
scale: torch.Tensor
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
torch.Tensor
|
||||||
|
The swizzled tensor with the same logical shape as *scale*.
|
||||||
|
"""
|
||||||
|
assert scale.dtype == torch.float8_e4m3fn, (
|
||||||
|
"swizzle_blockscale expects the input tensor to be in "
|
||||||
|
"torch.float8_e4m3fn format.")
|
||||||
|
|
||||||
|
scale_ndim = scale.ndim
|
||||||
|
if scale_ndim == 2:
|
||||||
|
scale = scale.unsqueeze(0) # (1, M, K)
|
||||||
|
assert scale.ndim == 3, "Expected a 2-D or 3-D tensor for block scales."
|
||||||
|
|
||||||
|
B, M, K = scale.shape
|
||||||
|
|
||||||
|
def _round_up(x: int, m: int) -> int:
|
||||||
|
return (x + m - 1) // m * m
|
||||||
|
|
||||||
|
M_padded = _round_up(M, 128)
|
||||||
|
K_padded = _round_up(K, 4)
|
||||||
|
|
||||||
|
padded = torch.zeros((B, M_padded, K_padded),
|
||||||
|
dtype=scale.dtype,
|
||||||
|
device=scale.device)
|
||||||
|
padded[:B, :M, :K] = scale
|
||||||
|
|
||||||
|
# Reshape / permute to the layout required by the kernel.
|
||||||
|
padded = padded.reshape(B, M_padded // 128, 4, 32, K_padded // 4, 4)
|
||||||
|
swizzled = padded.permute(0, 1, 4, 3, 2, 5).contiguous().cuda()
|
||||||
|
|
||||||
|
if scale_ndim == 2:
|
||||||
|
return swizzled.reshape(M, K)
|
||||||
|
return swizzled.reshape(B, M, K)
|
||||||
|
|
||||||
|
|
||||||
|
def cutlass_fp4_supported() -> bool:
|
||||||
|
if not current_platform.is_cuda():
|
||||||
|
return False
|
||||||
|
capability_tuple = current_platform.get_device_capability()
|
||||||
|
capability = -1 if capability_tuple is None else capability_tuple.to_int()
|
||||||
|
return cutlass_scaled_mm_supports_fp4(capability)
|
||||||
|
|||||||
@ -1,7 +1,7 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
from collections.abc import Iterable, Mapping, Sequence
|
from collections.abc import Iterable, Mapping, Sequence
|
||||||
from typing import Optional, TypedDict, Union
|
from typing import Annotated, Optional, Union
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
@ -29,6 +29,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
|||||||
PromptUpdate)
|
PromptUpdate)
|
||||||
from vllm.multimodal.profiling import BaseDummyInputsBuilder
|
from vllm.multimodal.profiling import BaseDummyInputsBuilder
|
||||||
from vllm.sequence import IntermediateTensors
|
from vllm.sequence import IntermediateTensors
|
||||||
|
from vllm.utils.tensor_schema import TensorSchema, TensorShape
|
||||||
|
|
||||||
# yapf: disable
|
# yapf: disable
|
||||||
from .idefics2_vision_model import Idefics2VisionConfig
|
from .idefics2_vision_model import Idefics2VisionConfig
|
||||||
@ -42,15 +43,26 @@ from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
|
|||||||
merge_multimodal_embeddings)
|
merge_multimodal_embeddings)
|
||||||
|
|
||||||
|
|
||||||
class AriaImagePixelInputs(TypedDict):
|
class AriaImagePixelInputs(TensorSchema):
|
||||||
pixel_values: torch.Tensor
|
|
||||||
pixel_mask: Optional[torch.Tensor]
|
|
||||||
"""
|
"""
|
||||||
Shape:
|
Dimensions:
|
||||||
pixel_values: `(batch_size * num_images, num_channels, height, width)`
|
- b: Batch size
|
||||||
pixel_mask: `(batch_size * num_images, height, width)`
|
- n: Number of images
|
||||||
|
- c: Number of channels
|
||||||
|
- h: Height of each image
|
||||||
|
- w: Width of each image
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
pixel_values: Annotated[
|
||||||
|
torch.Tensor,
|
||||||
|
TensorShape("bn", 3, "h", "w"),
|
||||||
|
]
|
||||||
|
|
||||||
|
pixel_mask: Annotated[
|
||||||
|
Optional[torch.Tensor],
|
||||||
|
TensorShape("bn", "h", "w"),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
class AriaVisionTransformer(Idefics3VisionTransformer, SupportsQuant):
|
class AriaVisionTransformer(Idefics3VisionTransformer, SupportsQuant):
|
||||||
packed_modules_mapping = {"qkv_proj": ["q_proj", "k_proj", "v_proj"]}
|
packed_modules_mapping = {"qkv_proj": ["q_proj", "k_proj", "v_proj"]}
|
||||||
@ -540,12 +552,6 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal):
|
|||||||
self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
|
self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
|
||||||
self.vocab_size, logit_scale)
|
self.vocab_size, logit_scale)
|
||||||
|
|
||||||
def _validate_image_sizes(
|
|
||||||
self, images: list[torch.Tensor]) -> list[torch.Tensor]:
|
|
||||||
if not all(img.shape == images[0].shape for img in images):
|
|
||||||
raise ValueError("All images must be the same size")
|
|
||||||
return images
|
|
||||||
|
|
||||||
def _parse_and_validate_image_input(
|
def _parse_and_validate_image_input(
|
||||||
self, **kwargs: object) -> Optional[AriaImagePixelInputs]:
|
self, **kwargs: object) -> Optional[AriaImagePixelInputs]:
|
||||||
pixel_values = kwargs.pop("pixel_values", None)
|
pixel_values = kwargs.pop("pixel_values", None)
|
||||||
@ -554,23 +560,9 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal):
|
|||||||
if pixel_values is None:
|
if pixel_values is None:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
if not isinstance(pixel_values, (torch.Tensor, list)):
|
|
||||||
raise ValueError("Incorrect type of pixel values. "
|
|
||||||
f"Got type: {type(pixel_values)}")
|
|
||||||
|
|
||||||
pixel_values = self._validate_image_sizes(pixel_values)
|
|
||||||
pixel_values = flatten_bn(pixel_values, concat=True)
|
|
||||||
|
|
||||||
if pixel_mask is not None:
|
|
||||||
if not isinstance(pixel_mask, (torch.Tensor, list)):
|
|
||||||
raise ValueError("Incorrect type of pixel mask. "
|
|
||||||
f"Got type: {type(pixel_mask)}")
|
|
||||||
|
|
||||||
pixel_mask = flatten_bn(pixel_mask, concat=True)
|
|
||||||
|
|
||||||
return AriaImagePixelInputs(
|
return AriaImagePixelInputs(
|
||||||
pixel_values=pixel_values,
|
pixel_values=flatten_bn(pixel_values, concat=True),
|
||||||
pixel_mask=pixel_mask,
|
pixel_mask=flatten_bn(pixel_mask, concat=True),
|
||||||
)
|
)
|
||||||
|
|
||||||
def _create_patch_attention_mask(
|
def _create_patch_attention_mask(
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user