mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-04-06 03:07:03 +08:00
Merge remote-tracking branch 'nm/lwilkinson/fix-flashmla-full-cudagraph' into wide_ep_working_branch
This commit is contained in:
commit
f1c9ef3afd
@ -74,7 +74,7 @@ Here is an example of one test inside `latency-tests.json`:
|
||||
In this example:
|
||||
|
||||
- The `test_name` attributes is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`.
|
||||
- The `parameters` attribute control the command line arguments to be used for `benchmark_latency.py`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underline to dash when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
|
||||
- The `parameters` attribute control the command line arguments to be used for `vllm bench latency`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underline to dash when feeding the arguments to `vllm bench latency`. For example, the corresponding command line arguments for `vllm bench latency` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
|
||||
|
||||
Note that the performance numbers are highly sensitive to the value of the parameters. Please make sure the parameters are set correctly.
|
||||
|
||||
@ -82,13 +82,13 @@ WARNING: The benchmarking script will save json results by itself, so please do
|
||||
|
||||
### Throughput test
|
||||
|
||||
The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except for that the parameters will be fed forward to `benchmark_throughput.py`.
|
||||
The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except for that the parameters will be fed forward to `vllm bench throughput`.
|
||||
|
||||
The number of this test is also stable -- a slight change on the value of this number might vary the performance numbers by a lot.
|
||||
|
||||
### Serving test
|
||||
|
||||
We test the throughput by using `benchmark_serving.py` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example:
|
||||
We test the throughput by using `vllm bench serve` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example:
|
||||
|
||||
```json
|
||||
[
|
||||
@ -118,8 +118,8 @@ Inside this example:
|
||||
|
||||
- The `test_name` attribute is also a unique identifier for the test. It must start with `serving_`.
|
||||
- The `server-parameters` includes the command line arguments for vLLM server.
|
||||
- The `client-parameters` includes the command line arguments for `benchmark_serving.py`.
|
||||
- The `qps_list` controls the list of qps for test. It will be used to configure the `--request-rate` parameter in `benchmark_serving.py`
|
||||
- The `client-parameters` includes the command line arguments for `vllm bench serve`.
|
||||
- The `qps_list` controls the list of qps for test. It will be used to configure the `--request-rate` parameter in `vllm bench serve`
|
||||
|
||||
The number of this test is less stable compared to the delay and latency benchmarks (due to randomized sharegpt dataset sampling inside `benchmark_serving.py`), but a large change on this number (e.g. 5% change) still vary the output greatly.
|
||||
|
||||
|
||||
@ -100,7 +100,7 @@ if __name__ == "__main__":
|
||||
raw_result = json.loads(f.read())
|
||||
|
||||
if "serving" in str(test_file):
|
||||
# this result is generated via `benchmark_serving.py`
|
||||
# this result is generated via `vllm bench serve` command
|
||||
|
||||
# attach the benchmarking command to raw_result
|
||||
try:
|
||||
@ -120,7 +120,7 @@ if __name__ == "__main__":
|
||||
continue
|
||||
|
||||
elif "latency" in f.name:
|
||||
# this result is generated via `benchmark_latency.py`
|
||||
# this result is generated via `vllm bench latency` command
|
||||
|
||||
# attach the benchmarking command to raw_result
|
||||
try:
|
||||
@ -148,7 +148,7 @@ if __name__ == "__main__":
|
||||
continue
|
||||
|
||||
elif "throughput" in f.name:
|
||||
# this result is generated via `benchmark_throughput.py`
|
||||
# this result is generated via `vllm bench throughput` command
|
||||
|
||||
# attach the benchmarking command to raw_result
|
||||
try:
|
||||
|
||||
@ -73,7 +73,7 @@ get_current_llm_serving_engine() {
|
||||
echo "Container: vllm"
|
||||
# move to a completely irrelevant directory, to avoid import vllm from current folder
|
||||
export CURRENT_LLM_SERVING_ENGINE=vllm
|
||||
|
||||
|
||||
return
|
||||
fi
|
||||
}
|
||||
@ -95,12 +95,14 @@ json2args() {
|
||||
}
|
||||
|
||||
kill_gpu_processes() {
|
||||
pkill -f python
|
||||
pkill -f python3
|
||||
pkill -f tritonserver
|
||||
pkill -f pt_main_thread
|
||||
pkill -f text-generation
|
||||
pkill -f lmdeploy
|
||||
pkill -f '[p]ython'
|
||||
pkill -f '[p]ython3'
|
||||
pkill -f '[t]ritonserver'
|
||||
pkill -f '[p]t_main_thread'
|
||||
pkill -f '[t]ext-generation'
|
||||
pkill -f '[l]mdeploy'
|
||||
# vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
|
||||
pkill -f '[V]LLM'
|
||||
|
||||
while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
|
||||
sleep 1
|
||||
@ -125,7 +127,7 @@ ensure_installed() {
|
||||
}
|
||||
|
||||
run_serving_tests() {
|
||||
# run serving tests using `benchmark_serving.py`
|
||||
# run serving tests using `vllm bench serve` command
|
||||
# $1: a json file specifying serving test cases
|
||||
|
||||
local serving_test_file
|
||||
@ -225,7 +227,7 @@ run_serving_tests() {
|
||||
|
||||
if [[ "$dataset_name" = "sharegpt" ]]; then
|
||||
|
||||
client_command="python3 benchmark_serving.py \
|
||||
client_command="vllm bench serve \
|
||||
--backend $backend \
|
||||
--tokenizer /tokenizer_cache \
|
||||
--model $model \
|
||||
@ -246,7 +248,7 @@ run_serving_tests() {
|
||||
sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len')
|
||||
sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len')
|
||||
|
||||
client_command="python3 benchmark_serving.py \
|
||||
client_command="vllm bench serve \
|
||||
--backend $backend \
|
||||
--tokenizer /tokenizer_cache \
|
||||
--model $model \
|
||||
@ -265,13 +267,13 @@ run_serving_tests() {
|
||||
$client_args"
|
||||
|
||||
else
|
||||
|
||||
|
||||
echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name."
|
||||
exit 1
|
||||
|
||||
fi
|
||||
|
||||
|
||||
|
||||
|
||||
echo "Running test case $test_name with qps $qps"
|
||||
echo "Client command: $client_command"
|
||||
@ -302,7 +304,7 @@ run_serving_tests() {
|
||||
}
|
||||
|
||||
run_genai_perf_tests() {
|
||||
# run genai-perf tests
|
||||
# run genai-perf tests
|
||||
|
||||
# $1: a json file specifying genai-perf test cases
|
||||
local genai_perf_test_file
|
||||
@ -311,14 +313,14 @@ run_genai_perf_tests() {
|
||||
# Iterate over genai-perf tests
|
||||
jq -c '.[]' "$genai_perf_test_file" | while read -r params; do
|
||||
# get the test name, and append the GPU type back to it.
|
||||
test_name=$(echo "$params" | jq -r '.test_name')
|
||||
|
||||
test_name=$(echo "$params" | jq -r '.test_name')
|
||||
|
||||
# if TEST_SELECTOR is set, only run the test cases that match the selector
|
||||
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
|
||||
echo "Skip test case $test_name."
|
||||
continue
|
||||
fi
|
||||
|
||||
|
||||
# prepend the current serving engine to the test name
|
||||
test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
|
||||
|
||||
@ -369,10 +371,10 @@ run_genai_perf_tests() {
|
||||
qps=$num_prompts
|
||||
echo "now qps is $qps"
|
||||
fi
|
||||
|
||||
|
||||
new_test_name=$test_name"_qps_"$qps
|
||||
backend=$CURRENT_LLM_SERVING_ENGINE
|
||||
|
||||
|
||||
if [[ "$backend" == *"vllm"* ]]; then
|
||||
backend="vllm"
|
||||
fi
|
||||
@ -413,7 +415,7 @@ prepare_dataset() {
|
||||
do
|
||||
cat sonnet.txt >> sonnet_4x.txt
|
||||
done
|
||||
|
||||
|
||||
}
|
||||
|
||||
main() {
|
||||
|
||||
@ -126,7 +126,8 @@ kill_gpu_processes() {
|
||||
ps -aux
|
||||
lsof -t -i:8000 | xargs -r kill -9
|
||||
pgrep python3 | xargs -r kill -9
|
||||
|
||||
# vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
|
||||
pgrep VLLM | xargs -r kill -9
|
||||
|
||||
# wait until GPU memory usage smaller than 1GB
|
||||
if command -v nvidia-smi; then
|
||||
@ -164,7 +165,7 @@ upload_to_buildkite() {
|
||||
}
|
||||
|
||||
run_latency_tests() {
|
||||
# run latency tests using `benchmark_latency.py`
|
||||
# run latency tests using `vllm bench latency` command
|
||||
# $1: a json file specifying latency test cases
|
||||
|
||||
local latency_test_file
|
||||
@ -205,7 +206,7 @@ run_latency_tests() {
|
||||
fi
|
||||
fi
|
||||
|
||||
latency_command=" $latency_envs python3 benchmark_latency.py \
|
||||
latency_command=" $latency_envs vllm bench latency \
|
||||
--output-json $RESULTS_FOLDER/${test_name}.json \
|
||||
$latency_args"
|
||||
|
||||
@ -231,7 +232,7 @@ run_latency_tests() {
|
||||
}
|
||||
|
||||
run_throughput_tests() {
|
||||
# run throughput tests using `benchmark_throughput.py`
|
||||
# run throughput tests using `vllm bench throughput`
|
||||
# $1: a json file specifying throughput test cases
|
||||
|
||||
local throughput_test_file
|
||||
@ -272,7 +273,7 @@ run_throughput_tests() {
|
||||
fi
|
||||
fi
|
||||
|
||||
throughput_command=" $throughput_envs python3 benchmark_throughput.py \
|
||||
throughput_command=" $throughput_envs vllm bench throughput \
|
||||
--output-json $RESULTS_FOLDER/${test_name}.json \
|
||||
$throughput_args"
|
||||
|
||||
@ -297,7 +298,7 @@ run_throughput_tests() {
|
||||
}
|
||||
|
||||
run_serving_tests() {
|
||||
# run serving tests using `benchmark_serving.py`
|
||||
# run serving tests using `vllm bench serve` command
|
||||
# $1: a json file specifying serving test cases
|
||||
|
||||
local serving_test_file
|
||||
@ -393,7 +394,7 @@ run_serving_tests() {
|
||||
|
||||
# pass the tensor parallel size to the client so that it can be displayed
|
||||
# on the benchmark dashboard
|
||||
client_command="python3 benchmark_serving.py \
|
||||
client_command="vllm bench serve \
|
||||
--save-result \
|
||||
--result-dir $RESULTS_FOLDER \
|
||||
--result-filename ${new_test_name}.json \
|
||||
@ -447,7 +448,7 @@ main() {
|
||||
(which jq) || (apt-get update && apt-get -y install jq)
|
||||
(which lsof) || (apt-get update && apt-get install -y lsof)
|
||||
|
||||
# get the current IP address, required by benchmark_serving.py
|
||||
# get the current IP address, required by `vllm bench serve` command
|
||||
export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
|
||||
# turn of the reporting of the status of each request, to clean up the terminal output
|
||||
export VLLM_LOGGING_LEVEL="WARNING"
|
||||
|
||||
@ -13,9 +13,9 @@ NUMA_NODE=${NUMA_NODE:-1}
|
||||
export CMAKE_BUILD_PARALLEL_LEVEL=32
|
||||
|
||||
# Setup cleanup
|
||||
remove_docker_container() {
|
||||
set -e;
|
||||
docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true;
|
||||
remove_docker_container() {
|
||||
set -e;
|
||||
docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true;
|
||||
}
|
||||
trap remove_docker_container EXIT
|
||||
remove_docker_container
|
||||
@ -69,7 +69,7 @@ function cpu_tests() {
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
pytest -s -v \
|
||||
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"
|
||||
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"
|
||||
|
||||
# Note: disable it until supports V1
|
||||
# Run AWQ test
|
||||
@ -83,7 +83,7 @@ function cpu_tests() {
|
||||
set -e
|
||||
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
|
||||
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
|
||||
python3 benchmarks/benchmark_serving.py \
|
||||
vllm bench serve \
|
||||
--backend vllm \
|
||||
--dataset-name random \
|
||||
--model meta-llama/Llama-3.2-3B-Instruct \
|
||||
|
||||
166
.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
Executable file
166
.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
Executable file
@ -0,0 +1,166 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -xu
|
||||
|
||||
|
||||
remove_docker_container() {
|
||||
docker rm -f tpu-test || true;
|
||||
docker rm -f vllm-tpu || true;
|
||||
}
|
||||
|
||||
trap remove_docker_container EXIT
|
||||
|
||||
# Remove the container that might not be cleaned up in the previous run.
|
||||
remove_docker_container
|
||||
|
||||
# Build the docker image.
|
||||
docker build -f docker/Dockerfile.tpu -t vllm-tpu .
|
||||
|
||||
# Set up cleanup.
|
||||
cleanup_docker() {
|
||||
# Get Docker's root directory
|
||||
docker_root=$(docker info -f '{{.DockerRootDir}}')
|
||||
if [ -z "$docker_root" ]; then
|
||||
echo "Failed to determine Docker root directory."
|
||||
exit 1
|
||||
fi
|
||||
echo "Docker root directory: $docker_root"
|
||||
# Check disk usage of the filesystem where Docker's root directory is located
|
||||
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
|
||||
# Define the threshold
|
||||
threshold=70
|
||||
if [ "$disk_usage" -gt "$threshold" ]; then
|
||||
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
|
||||
# Remove dangling images (those that are not tagged and not used by any container)
|
||||
docker image prune -f
|
||||
# Remove unused volumes / force the system prune for old images as well.
|
||||
docker volume prune -f && docker system prune --force --filter "until=72h" --all
|
||||
echo "Docker images and volumes cleanup completed."
|
||||
else
|
||||
echo "Disk usage is below $threshold%. No cleanup needed."
|
||||
fi
|
||||
}
|
||||
cleanup_docker
|
||||
|
||||
# For HF_TOKEN.
|
||||
source /etc/environment
|
||||
|
||||
docker run --privileged --net host --shm-size=16G -it \
|
||||
-e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
|
||||
vllm-tpu /bin/bash -c '
|
||||
set -e # Exit immediately if a command exits with a non-zero status.
|
||||
set -u # Treat unset variables as an error.
|
||||
|
||||
echo "--- Starting script inside Docker container ---"
|
||||
|
||||
# Create results directory
|
||||
RESULTS_DIR=$(mktemp -d)
|
||||
# If mktemp fails, set -e will cause the script to exit.
|
||||
echo "Results will be stored in: $RESULTS_DIR"
|
||||
|
||||
# Install dependencies
|
||||
echo "--- Installing Python dependencies ---"
|
||||
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
|
||||
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
|
||||
&& python3 -m pip install --progress-bar off lm_eval[api]==0.4.4 \
|
||||
&& python3 -m pip install --progress-bar off hf-transfer
|
||||
echo "--- Python dependencies installed ---"
|
||||
export VLLM_USE_V1=1
|
||||
export VLLM_XLA_CHECK_RECOMPILATION=1
|
||||
export VLLM_XLA_CACHE_PATH=
|
||||
echo "Using VLLM V1"
|
||||
|
||||
echo "--- Hardware Information ---"
|
||||
# tpu-info
|
||||
echo "--- Starting Tests ---"
|
||||
set +e
|
||||
overall_script_exit_code=0
|
||||
|
||||
# --- Test Definitions ---
|
||||
# If a test fails, this function will print logs and will not cause the main script to exit.
|
||||
run_test() {
|
||||
local test_num=$1
|
||||
local test_name=$2
|
||||
local test_command=$3
|
||||
local log_file="$RESULTS_DIR/test_${test_num}.log"
|
||||
local actual_exit_code
|
||||
|
||||
echo "--- TEST_$test_num: Running $test_name ---"
|
||||
|
||||
# Execute the test command.
|
||||
eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2)
|
||||
actual_exit_code=$?
|
||||
|
||||
echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" # This goes to main log
|
||||
echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file" # Also to per-test log
|
||||
|
||||
if [ "$actual_exit_code" -ne 0 ]; then
|
||||
echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2
|
||||
echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2
|
||||
if [ -f "$log_file" ]; then
|
||||
cat "$log_file" >&2
|
||||
else
|
||||
echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2
|
||||
fi
|
||||
echo "--- End of log for TEST_$test_num ($test_name) ---" >&2
|
||||
return "$actual_exit_code" # Return the failure code
|
||||
else
|
||||
echo "TEST_$test_num ($test_name) PASSED."
|
||||
return 0 # Return success
|
||||
fi
|
||||
}
|
||||
|
||||
# Helper function to call run_test and update the overall script exit code
|
||||
run_and_track_test() {
|
||||
local test_num_arg="$1"
|
||||
local test_name_arg="$2"
|
||||
local test_command_arg="$3"
|
||||
|
||||
# Run the test
|
||||
run_test "$test_num_arg" "$test_name_arg" "$test_command_arg"
|
||||
local test_specific_exit_code=$?
|
||||
|
||||
# If the test failed, set the overall script exit code to 1
|
||||
if [ "$test_specific_exit_code" -ne 0 ]; then
|
||||
# No need for extra echo here, run_test already logged the failure.
|
||||
overall_script_exit_code=1
|
||||
fi
|
||||
}
|
||||
|
||||
# --- Actual Test Execution ---
|
||||
run_and_track_test 1 "test_struct_output_generate.py" \
|
||||
"HF_HUB_DISABLE_XET=1 python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
|
||||
run_and_track_test 2 "test_moe_pallas.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
|
||||
run_and_track_test 3 "test_lora.py" \
|
||||
"VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
|
||||
run_and_track_test 4 "test_tpu_qkv_linear.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
|
||||
run_and_track_test 5 "test_spmd_model_weight_loading.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
|
||||
run_and_track_test 6 "test_kv_cache_update_kernel.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py"
|
||||
|
||||
# After all tests have been attempted, exit with the overall status.
|
||||
if [ "$overall_script_exit_code" -ne 0 ]; then
|
||||
echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---"
|
||||
else
|
||||
echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---"
|
||||
fi
|
||||
exit "$overall_script_exit_code"
|
||||
' # IMPORTANT: This is the closing single quote for the bash -c "..." command. Ensure it is present and correct.
|
||||
|
||||
# Capture the exit code of the docker run command
|
||||
DOCKER_RUN_EXIT_CODE=$?
|
||||
|
||||
# The trap will run for cleanup.
|
||||
# Exit the main script with the Docker run command's exit code.
|
||||
if [ "$DOCKER_RUN_EXIT_CODE" -ne 0 ]; then
|
||||
echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE."
|
||||
exit "$DOCKER_RUN_EXIT_CODE"
|
||||
else
|
||||
echo "Docker run command completed successfully."
|
||||
exit 0
|
||||
fi
|
||||
# TODO: This test fails because it uses RANDOM_SEED sampling
|
||||
# pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
|
||||
@ -150,18 +150,6 @@ run_and_track_test 9 "test_multimodal.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py"
|
||||
run_and_track_test 10 "test_pallas.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py"
|
||||
run_and_track_test 11 "test_struct_output_generate.py" \
|
||||
"HF_HUB_DISABLE_XET=1 python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
|
||||
run_and_track_test 12 "test_moe_pallas.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
|
||||
run_and_track_test 13 "test_lora.py" \
|
||||
"VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
|
||||
run_and_track_test 14 "test_tpu_qkv_linear.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
|
||||
run_and_track_test 15 "test_spmd_model_weight_loading.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
|
||||
run_and_track_test 16 "test_kv_cache_update_kernel.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py"
|
||||
|
||||
# After all tests have been attempted, exit with the overall status.
|
||||
if [ "$overall_script_exit_code" -ne 0 ]; then
|
||||
|
||||
@ -11,10 +11,10 @@ cd "$(dirname "${BASH_SOURCE[0]}")/../.."
|
||||
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
|
||||
|
||||
# run python-based benchmarks and upload the result to buildkite
|
||||
python3 benchmarks/benchmark_latency.py --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
|
||||
vllm bench latency --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
|
||||
bench_latency_exit_code=$?
|
||||
|
||||
python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
|
||||
vllm bench throughput --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
|
||||
bench_throughput_exit_code=$?
|
||||
|
||||
# run server-based benchmarks and upload the result to buildkite
|
||||
@ -24,7 +24,7 @@ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/r
|
||||
|
||||
# wait for server to start, timeout after 600 seconds
|
||||
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
|
||||
python3 benchmarks/benchmark_serving.py \
|
||||
vllm bench serve \
|
||||
--backend vllm \
|
||||
--dataset-name sharegpt \
|
||||
--dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \
|
||||
|
||||
@ -77,7 +77,7 @@ done
|
||||
echo "run benchmark test..."
|
||||
echo "logging to $BM_LOG"
|
||||
echo
|
||||
python benchmarks/benchmark_serving.py \
|
||||
vllm bench serve \
|
||||
--backend vllm \
|
||||
--model $MODEL \
|
||||
--dataset-name sonnet \
|
||||
|
||||
2
.github/workflows/lint-and-deploy.yaml
vendored
2
.github/workflows/lint-and-deploy.yaml
vendored
@ -7,7 +7,7 @@ permissions:
|
||||
|
||||
jobs:
|
||||
lint-and-deploy:
|
||||
runs-on: ubuntu-latest
|
||||
runs-on: ubuntu-24.04-arm
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
|
||||
@ -98,7 +98,7 @@ Then run the benchmarking script
|
||||
```bash
|
||||
# download dataset
|
||||
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
||||
python3 vllm/benchmarks/benchmark_serving.py \
|
||||
vllm bench serve \
|
||||
--backend vllm \
|
||||
--model NousResearch/Hermes-3-Llama-3.1-8B \
|
||||
--endpoint /v1/completions \
|
||||
@ -111,25 +111,25 @@ If successful, you will see the following output
|
||||
|
||||
```
|
||||
============ Serving Benchmark Result ============
|
||||
Successful requests: 10
|
||||
Benchmark duration (s): 5.78
|
||||
Total input tokens: 1369
|
||||
Total generated tokens: 2212
|
||||
Request throughput (req/s): 1.73
|
||||
Output token throughput (tok/s): 382.89
|
||||
Total Token throughput (tok/s): 619.85
|
||||
Successful requests: 10
|
||||
Benchmark duration (s): 5.78
|
||||
Total input tokens: 1369
|
||||
Total generated tokens: 2212
|
||||
Request throughput (req/s): 1.73
|
||||
Output token throughput (tok/s): 382.89
|
||||
Total Token throughput (tok/s): 619.85
|
||||
---------------Time to First Token----------------
|
||||
Mean TTFT (ms): 71.54
|
||||
Median TTFT (ms): 73.88
|
||||
P99 TTFT (ms): 79.49
|
||||
Mean TTFT (ms): 71.54
|
||||
Median TTFT (ms): 73.88
|
||||
P99 TTFT (ms): 79.49
|
||||
-----Time per Output Token (excl. 1st token)------
|
||||
Mean TPOT (ms): 7.91
|
||||
Median TPOT (ms): 7.96
|
||||
P99 TPOT (ms): 8.03
|
||||
Mean TPOT (ms): 7.91
|
||||
Median TPOT (ms): 7.96
|
||||
P99 TPOT (ms): 8.03
|
||||
---------------Inter-token Latency----------------
|
||||
Mean ITL (ms): 7.74
|
||||
Median ITL (ms): 7.70
|
||||
P99 ITL (ms): 8.39
|
||||
Mean ITL (ms): 7.74
|
||||
Median ITL (ms): 7.70
|
||||
P99 ITL (ms): 8.39
|
||||
==================================================
|
||||
```
|
||||
|
||||
@ -141,7 +141,7 @@ If the dataset you want to benchmark is not supported yet in vLLM, even then you
|
||||
{"prompt": "What is the capital of India?"}
|
||||
{"prompt": "What is the capital of Iran?"}
|
||||
{"prompt": "What is the capital of China?"}
|
||||
```
|
||||
```
|
||||
|
||||
```bash
|
||||
# start server
|
||||
@ -150,7 +150,7 @@ VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct --disable-log-requests
|
||||
|
||||
```bash
|
||||
# run benchmarking script
|
||||
python3 benchmarks/benchmark_serving.py --port 9001 --save-result --save-detailed \
|
||||
vllm bench serve --port 9001 --save-result --save-detailed \
|
||||
--backend vllm \
|
||||
--model meta-llama/Llama-3.1-8B-Instruct \
|
||||
--endpoint /v1/completions \
|
||||
@ -174,7 +174,7 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
|
||||
```
|
||||
|
||||
```bash
|
||||
python3 vllm/benchmarks/benchmark_serving.py \
|
||||
vllm bench serve \
|
||||
--backend openai-chat \
|
||||
--model Qwen/Qwen2-VL-7B-Instruct \
|
||||
--endpoint /v1/chat/completions \
|
||||
@ -194,7 +194,7 @@ VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
|
||||
```
|
||||
|
||||
``` bash
|
||||
python3 benchmarks/benchmark_serving.py \
|
||||
vllm bench serve \
|
||||
--model meta-llama/Meta-Llama-3-8B-Instruct \
|
||||
--dataset-name hf \
|
||||
--dataset-path likaixin/InstructCoder \
|
||||
@ -210,7 +210,7 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
|
||||
**`lmms-lab/LLaVA-OneVision-Data`**
|
||||
|
||||
```bash
|
||||
python3 vllm/benchmarks/benchmark_serving.py \
|
||||
vllm bench serve \
|
||||
--backend openai-chat \
|
||||
--model Qwen/Qwen2-VL-7B-Instruct \
|
||||
--endpoint /v1/chat/completions \
|
||||
@ -224,7 +224,7 @@ python3 vllm/benchmarks/benchmark_serving.py \
|
||||
**`Aeala/ShareGPT_Vicuna_unfiltered`**
|
||||
|
||||
```bash
|
||||
python3 vllm/benchmarks/benchmark_serving.py \
|
||||
vllm bench serve \
|
||||
--backend openai-chat \
|
||||
--model Qwen/Qwen2-VL-7B-Instruct \
|
||||
--endpoint /v1/chat/completions \
|
||||
@ -237,7 +237,7 @@ python3 vllm/benchmarks/benchmark_serving.py \
|
||||
**`AI-MO/aimo-validation-aime`**
|
||||
|
||||
``` bash
|
||||
python3 vllm/benchmarks/benchmark_serving.py \
|
||||
vllm bench serve \
|
||||
--model Qwen/QwQ-32B \
|
||||
--dataset-name hf \
|
||||
--dataset-path AI-MO/aimo-validation-aime \
|
||||
@ -248,7 +248,7 @@ python3 vllm/benchmarks/benchmark_serving.py \
|
||||
**`philschmid/mt-bench`**
|
||||
|
||||
``` bash
|
||||
python3 vllm/benchmarks/benchmark_serving.py \
|
||||
vllm bench serve \
|
||||
--model Qwen/QwQ-32B \
|
||||
--dataset-name hf \
|
||||
--dataset-path philschmid/mt-bench \
|
||||
@ -261,7 +261,7 @@ When using OpenAI-compatible backends such as `vllm`, optional sampling
|
||||
parameters can be specified. Example client command:
|
||||
|
||||
```bash
|
||||
python3 vllm/benchmarks/benchmark_serving.py \
|
||||
vllm bench serve \
|
||||
--backend vllm \
|
||||
--model NousResearch/Hermes-3-Llama-3.1-8B \
|
||||
--endpoint /v1/completions \
|
||||
@ -296,7 +296,7 @@ The following arguments can be used to control the ramp-up:
|
||||
<br/>
|
||||
|
||||
```bash
|
||||
python3 vllm/benchmarks/benchmark_throughput.py \
|
||||
vllm bench throughput \
|
||||
--model NousResearch/Hermes-3-Llama-3.1-8B \
|
||||
--dataset-name sonnet \
|
||||
--dataset-path vllm/benchmarks/sonnet.txt \
|
||||
@ -314,7 +314,7 @@ Total num output tokens: 1500
|
||||
**VisionArena Benchmark for Vision Language Models**
|
||||
|
||||
``` bash
|
||||
python3 vllm/benchmarks/benchmark_throughput.py \
|
||||
vllm bench throughput \
|
||||
--model Qwen/Qwen2-VL-7B-Instruct \
|
||||
--backend vllm-chat \
|
||||
--dataset-name hf \
|
||||
@ -336,7 +336,7 @@ Total num output tokens: 1280
|
||||
``` bash
|
||||
VLLM_WORKER_MULTIPROC_METHOD=spawn \
|
||||
VLLM_USE_V1=1 \
|
||||
python3 vllm/benchmarks/benchmark_throughput.py \
|
||||
vllm bench throughput \
|
||||
--dataset-name=hf \
|
||||
--dataset-path=likaixin/InstructCoder \
|
||||
--model=meta-llama/Meta-Llama-3-8B-Instruct \
|
||||
@ -360,7 +360,7 @@ Total num output tokens: 204800
|
||||
**`lmms-lab/LLaVA-OneVision-Data`**
|
||||
|
||||
```bash
|
||||
python3 vllm/benchmarks/benchmark_throughput.py \
|
||||
vllm bench throughput \
|
||||
--model Qwen/Qwen2-VL-7B-Instruct \
|
||||
--backend vllm-chat \
|
||||
--dataset-name hf \
|
||||
@ -373,7 +373,7 @@ python3 vllm/benchmarks/benchmark_throughput.py \
|
||||
**`Aeala/ShareGPT_Vicuna_unfiltered`**
|
||||
|
||||
```bash
|
||||
python3 vllm/benchmarks/benchmark_throughput.py \
|
||||
vllm bench throughput \
|
||||
--model Qwen/Qwen2-VL-7B-Instruct \
|
||||
--backend vllm-chat \
|
||||
--dataset-name hf \
|
||||
@ -385,7 +385,7 @@ python3 vllm/benchmarks/benchmark_throughput.py \
|
||||
**`AI-MO/aimo-validation-aime`**
|
||||
|
||||
```bash
|
||||
python3 benchmarks/benchmark_throughput.py \
|
||||
vllm bench throughput \
|
||||
--model Qwen/QwQ-32B \
|
||||
--backend vllm \
|
||||
--dataset-name hf \
|
||||
@ -399,7 +399,7 @@ python3 benchmarks/benchmark_throughput.py \
|
||||
``` bash
|
||||
# download dataset
|
||||
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
||||
python3 vllm/benchmarks/benchmark_throughput.py \
|
||||
vllm bench throughput \
|
||||
--model meta-llama/Llama-2-7b-hf \
|
||||
--backend vllm \
|
||||
--dataset_path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
|
||||
|
||||
@ -105,7 +105,7 @@ After the script finishes, you will find the results in a new, timestamped direc
|
||||
|
||||
- **Log Files**: The directory (`$BASE/auto-benchmark/YYYY_MM_DD_HH_MM/`) contains detailed logs for each run:
|
||||
- `vllm_log_...txt`: The log output from the vLLM server for each parameter combination.
|
||||
- `bm_log_...txt`: The log output from the `benchmark_serving.py` script for each benchmark run.
|
||||
- `bm_log_...txt`: The log output from the `vllm bench serve` command for each benchmark run.
|
||||
|
||||
- **Final Result Summary**: A file named `result.txt` is created in the log directory. It contains a summary of each tested combination and concludes with the overall best parameters found.
|
||||
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
#!/bin/bash
|
||||
|
||||
# This script aims to tune the best server parameter combinations to maximize throughput for given requirement.
|
||||
# This script aims to tune the best server parameter combinations to maximize throughput for given requirement.
|
||||
# See details in README (benchmarks/auto_tune/README.md).
|
||||
|
||||
TAG=$(date +"%Y_%m_%d_%H_%M")
|
||||
@ -56,7 +56,7 @@ start_server() {
|
||||
local max_num_batched_tokens=$3
|
||||
local vllm_log=$4
|
||||
local profile_dir=$5
|
||||
|
||||
|
||||
pkill -f vllm
|
||||
|
||||
VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir vllm serve $MODEL \
|
||||
@ -73,9 +73,9 @@ start_server() {
|
||||
|
||||
# wait for 10 minutes...
|
||||
server_started=0
|
||||
for i in {1..60}; do
|
||||
for i in {1..60}; do
|
||||
RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
|
||||
STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
|
||||
STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
|
||||
if [[ "$STATUS_CODE" -eq 200 ]]; then
|
||||
server_started=1
|
||||
break
|
||||
@ -98,10 +98,10 @@ update_best_profile() {
|
||||
selected_profile_file=
|
||||
if [[ "$SYSTEM" == "TPU" ]]; then
|
||||
selected_profile_file="${sorted_paths[$profile_index]}/*.xplane.pb"
|
||||
fi
|
||||
fi
|
||||
if [[ "$SYSTEM" == "GPU" ]]; then
|
||||
selected_profile_file="${sorted_paths[$profile_index]}"
|
||||
fi
|
||||
fi
|
||||
rm -f $PROFILE_PATH/*
|
||||
cp $selected_profile_file $PROFILE_PATH
|
||||
}
|
||||
@ -129,14 +129,14 @@ run_benchmark() {
|
||||
echo "server started."
|
||||
fi
|
||||
echo
|
||||
|
||||
|
||||
echo "run benchmark test..."
|
||||
meet_latency_requirement=0
|
||||
# get a basic qps by using request-rate inf
|
||||
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt"
|
||||
prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 ))
|
||||
adjusted_input_len=$(( INPUT_LEN - prefix_len ))
|
||||
python3 benchmarks/benchmark_serving.py \
|
||||
vllm bench serve \
|
||||
--backend vllm \
|
||||
--model $MODEL \
|
||||
--dataset-name random \
|
||||
@ -169,7 +169,7 @@ adjusted_input_len=$(( INPUT_LEN - prefix_len ))
|
||||
curl -X POST http://0.0.0.0:8004/reset_prefix_cache
|
||||
sleep 5
|
||||
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt"
|
||||
python3 benchmarks/benchmark_serving.py \
|
||||
vllm bench serve \
|
||||
--backend vllm \
|
||||
--model $MODEL \
|
||||
--dataset-name random \
|
||||
|
||||
@ -11,6 +11,7 @@ from typing import Any, Optional
|
||||
|
||||
import numpy as np
|
||||
from tqdm import tqdm
|
||||
from typing_extensions import deprecated
|
||||
|
||||
import vllm.envs as envs
|
||||
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
|
||||
@ -34,6 +35,10 @@ def save_to_pytorch_benchmark_format(
|
||||
write_to_json(pt_file, pt_records)
|
||||
|
||||
|
||||
@deprecated(
|
||||
"benchmark_latency.py is deprecated and will be removed in a "
|
||||
"future version. Please use 'vllm bench latency' instead.",
|
||||
)
|
||||
def main(args: argparse.Namespace):
|
||||
print(args)
|
||||
|
||||
|
||||
@ -38,6 +38,7 @@ from typing import Any, Literal, Optional
|
||||
import numpy as np
|
||||
from tqdm.asyncio import tqdm
|
||||
from transformers import PreTrainedTokenizerBase
|
||||
from typing_extensions import deprecated
|
||||
|
||||
from backend_request_func import (
|
||||
ASYNC_REQUEST_FUNCS,
|
||||
@ -593,6 +594,10 @@ def save_to_pytorch_benchmark_format(
|
||||
write_to_json(pt_file, pt_records)
|
||||
|
||||
|
||||
@deprecated(
|
||||
"benchmark_serving.py is deprecated and will be removed in a future "
|
||||
"version. Please use 'vllm bench serve' instead.",
|
||||
)
|
||||
def main(args: argparse.Namespace):
|
||||
print(args)
|
||||
random.seed(args.seed)
|
||||
|
||||
@ -15,6 +15,7 @@ import torch
|
||||
import uvloop
|
||||
from tqdm import tqdm
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase
|
||||
from typing_extensions import deprecated
|
||||
|
||||
from benchmark_dataset import (
|
||||
AIMODataset,
|
||||
@ -382,6 +383,10 @@ def get_requests(args, tokenizer):
|
||||
return dataset_cls(**common_kwargs).sample(**sample_kwargs)
|
||||
|
||||
|
||||
@deprecated(
|
||||
"benchmark_throughput.py is deprecated and will be removed in a "
|
||||
"future version. Please use 'vllm bench throughput' instead.",
|
||||
)
|
||||
def main(args: argparse.Namespace):
|
||||
if args.seed is None:
|
||||
args.seed = 0
|
||||
|
||||
@ -3,7 +3,7 @@
|
||||
# benchmark the overhead of disaggregated prefill.
|
||||
# methodology:
|
||||
# - send all request to prefill vLLM instance. It will buffer KV cache.
|
||||
# - then send all request to decode instance.
|
||||
# - then send all request to decode instance.
|
||||
# - The TTFT of decode instance is the overhead.
|
||||
|
||||
set -ex
|
||||
@ -12,6 +12,8 @@ kill_gpu_processes() {
|
||||
# kill all processes on GPU.
|
||||
pgrep pt_main_thread | xargs -r kill -9
|
||||
pgrep python3 | xargs -r kill -9
|
||||
# vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
|
||||
pgrep VLLM | xargs -r kill -9
|
||||
sleep 10
|
||||
|
||||
# remove vllm config file
|
||||
@ -61,7 +63,7 @@ benchmark() {
|
||||
--gpu-memory-utilization 0.6 \
|
||||
--kv-transfer-config \
|
||||
'{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
|
||||
|
||||
|
||||
|
||||
CUDA_VISIBLE_DEVICES=1 python3 \
|
||||
-m vllm.entrypoints.openai.api_server \
|
||||
@ -76,38 +78,38 @@ benchmark() {
|
||||
wait_for_server 8200
|
||||
|
||||
# let the prefill instance finish prefill
|
||||
python3 ../benchmark_serving.py \
|
||||
--backend vllm \
|
||||
--model $model \
|
||||
--dataset-name $dataset_name \
|
||||
--dataset-path $dataset_path \
|
||||
--sonnet-input-len $input_len \
|
||||
--sonnet-output-len "$output_len" \
|
||||
--sonnet-prefix-len $prefix_len \
|
||||
--num-prompts $num_prompts \
|
||||
--port 8100 \
|
||||
--save-result \
|
||||
--result-dir $results_folder \
|
||||
--result-filename disagg_prefill_tp1.json \
|
||||
--request-rate "inf"
|
||||
vllm bench serve \
|
||||
--backend vllm \
|
||||
--model $model \
|
||||
--dataset-name $dataset_name \
|
||||
--dataset-path $dataset_path \
|
||||
--sonnet-input-len $input_len \
|
||||
--sonnet-output-len "$output_len" \
|
||||
--sonnet-prefix-len $prefix_len \
|
||||
--num-prompts $num_prompts \
|
||||
--port 8100 \
|
||||
--save-result \
|
||||
--result-dir $results_folder \
|
||||
--result-filename disagg_prefill_tp1.json \
|
||||
--request-rate "inf"
|
||||
|
||||
|
||||
# send the request to decode.
|
||||
# The TTFT of this command will be the overhead of disagg prefill impl.
|
||||
python3 ../benchmark_serving.py \
|
||||
--backend vllm \
|
||||
--model $model \
|
||||
--dataset-name $dataset_name \
|
||||
--dataset-path $dataset_path \
|
||||
--sonnet-input-len $input_len \
|
||||
--sonnet-output-len "$output_len" \
|
||||
--sonnet-prefix-len $prefix_len \
|
||||
--num-prompts $num_prompts \
|
||||
--port 8200 \
|
||||
--save-result \
|
||||
--result-dir $results_folder \
|
||||
--result-filename disagg_prefill_tp1_overhead.json \
|
||||
--request-rate "$qps"
|
||||
vllm bench serve \
|
||||
--backend vllm \
|
||||
--model $model \
|
||||
--dataset-name $dataset_name \
|
||||
--dataset-path $dataset_path \
|
||||
--sonnet-input-len $input_len \
|
||||
--sonnet-output-len "$output_len" \
|
||||
--sonnet-prefix-len $prefix_len \
|
||||
--num-prompts $num_prompts \
|
||||
--port 8200 \
|
||||
--save-result \
|
||||
--result-dir $results_folder \
|
||||
--result-filename disagg_prefill_tp1_overhead.json \
|
||||
--request-rate "$qps"
|
||||
kill_gpu_processes
|
||||
|
||||
}
|
||||
|
||||
@ -18,6 +18,8 @@ kill_gpu_processes() {
|
||||
# kill all processes on GPU.
|
||||
pgrep pt_main_thread | xargs -r kill -9
|
||||
pgrep python3 | xargs -r kill -9
|
||||
# vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
|
||||
pgrep VLLM | xargs -r kill -9
|
||||
for port in 8000 8100 8200; do lsof -t -i:$port | xargs -r kill -9; done
|
||||
sleep 1
|
||||
}
|
||||
@ -58,7 +60,7 @@ launch_chunked_prefill() {
|
||||
|
||||
|
||||
launch_disagg_prefill() {
|
||||
model="meta-llama/Meta-Llama-3.1-8B-Instruct"
|
||||
model="meta-llama/Meta-Llama-3.1-8B-Instruct"
|
||||
# disagg prefill
|
||||
CUDA_VISIBLE_DEVICES=0 python3 \
|
||||
-m vllm.entrypoints.openai.api_server \
|
||||
@ -97,20 +99,20 @@ benchmark() {
|
||||
output_len=$2
|
||||
tag=$3
|
||||
|
||||
python3 ../benchmark_serving.py \
|
||||
--backend vllm \
|
||||
--model $model \
|
||||
--dataset-name $dataset_name \
|
||||
--dataset-path $dataset_path \
|
||||
--sonnet-input-len $input_len \
|
||||
--sonnet-output-len "$output_len" \
|
||||
--sonnet-prefix-len $prefix_len \
|
||||
--num-prompts $num_prompts \
|
||||
--port 8000 \
|
||||
--save-result \
|
||||
--result-dir $results_folder \
|
||||
--result-filename "$tag"-qps-"$qps".json \
|
||||
--request-rate "$qps"
|
||||
vllm bench serve \
|
||||
--backend vllm \
|
||||
--model $model \
|
||||
--dataset-name $dataset_name \
|
||||
--dataset-path $dataset_path \
|
||||
--sonnet-input-len $input_len \
|
||||
--sonnet-output-len "$output_len" \
|
||||
--sonnet-prefix-len $prefix_len \
|
||||
--num-prompts $num_prompts \
|
||||
--port 8000 \
|
||||
--save-result \
|
||||
--result-dir $results_folder \
|
||||
--result-filename "$tag"-qps-"$qps".json \
|
||||
--request-rate "$qps"
|
||||
|
||||
sleep 2
|
||||
}
|
||||
|
||||
@ -5,9 +5,8 @@ import itertools
|
||||
|
||||
import torch
|
||||
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.model_executor.layers.fused_moe.moe_align_block_size import (
|
||||
moe_align_block_size_triton,
|
||||
moe_align_block_size,
|
||||
)
|
||||
from vllm.triton_utils import triton
|
||||
|
||||
@ -21,60 +20,6 @@ def get_topk_ids(num_tokens: int, num_experts: int, topk: int) -> torch.Tensor:
|
||||
)
|
||||
|
||||
|
||||
def check_correctness(num_tokens, num_experts=256, block_size=256, topk=8):
|
||||
"""
|
||||
Verifies vllm vs. Triton
|
||||
"""
|
||||
topk_ids = get_topk_ids(num_tokens, num_experts, topk)
|
||||
|
||||
# 1. malloc space for triton and vllm
|
||||
# malloc enough space (max_num_tokens_padded) for the sorted ids
|
||||
max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
|
||||
sorted_ids_triton = torch.empty(
|
||||
(max_num_tokens_padded,), dtype=torch.int32, device="cuda"
|
||||
)
|
||||
expert_ids_triton = torch.empty(
|
||||
(max_num_tokens_padded // block_size,), dtype=torch.int32, device="cuda"
|
||||
)
|
||||
num_tokens_post_pad_triton = torch.empty((1,), dtype=torch.int32, device="cuda")
|
||||
|
||||
sorted_ids_vllm = torch.empty_like(sorted_ids_triton)
|
||||
expert_ids_vllm = torch.empty_like(expert_ids_triton)
|
||||
num_tokens_post_pad_vllm = torch.empty_like(num_tokens_post_pad_triton)
|
||||
|
||||
# 2. run implementations
|
||||
moe_align_block_size_triton(
|
||||
topk_ids,
|
||||
num_experts,
|
||||
block_size,
|
||||
sorted_ids_triton,
|
||||
expert_ids_triton,
|
||||
num_tokens_post_pad_triton,
|
||||
)
|
||||
|
||||
ops.moe_align_block_size(
|
||||
topk_ids,
|
||||
num_experts,
|
||||
block_size,
|
||||
sorted_ids_vllm,
|
||||
expert_ids_vllm,
|
||||
num_tokens_post_pad_vllm,
|
||||
)
|
||||
print(f"✅ VLLM implementation works with {num_experts} experts!")
|
||||
|
||||
# 3. compare results
|
||||
if torch.allclose(expert_ids_triton, expert_ids_vllm) and torch.allclose(
|
||||
num_tokens_post_pad_triton, num_tokens_post_pad_vllm
|
||||
):
|
||||
print("✅ Triton and VLLM implementations match.")
|
||||
else:
|
||||
print("❌ Triton and VLLM implementations DO NOT match.")
|
||||
print("Triton expert_ids:", expert_ids_triton)
|
||||
print("VLLM expert_ids:", expert_ids_vllm)
|
||||
print("Triton num_tokens_post_pad:", num_tokens_post_pad_triton)
|
||||
print("VLLM num_tokens_post_pad:", num_tokens_post_pad_vllm)
|
||||
|
||||
|
||||
# test configurations
|
||||
num_tokens_range = [1, 16, 256, 4096]
|
||||
num_experts_range = [16, 64, 224, 256, 280, 512]
|
||||
@ -87,8 +32,8 @@ configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range
|
||||
x_names=["num_tokens", "num_experts", "topk"],
|
||||
x_vals=configs,
|
||||
line_arg="provider",
|
||||
line_vals=["vllm", "triton"], # "triton"
|
||||
line_names=["VLLM", "Triton"], # "Triton"
|
||||
line_vals=["vllm"],
|
||||
line_names=["vLLM"],
|
||||
plot_name="moe-align-block-size-performance",
|
||||
args={},
|
||||
)
|
||||
@ -98,36 +43,11 @@ def benchmark(num_tokens, num_experts, topk, provider):
|
||||
block_size = 256
|
||||
topk_ids = get_topk_ids(num_tokens, num_experts, topk)
|
||||
|
||||
max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
|
||||
sorted_ids = torch.empty((max_num_tokens_padded,), dtype=torch.int32, device="cuda")
|
||||
max_num_m_blocks = max_num_tokens_padded // block_size
|
||||
expert_ids = torch.empty((max_num_m_blocks,), dtype=torch.int32, device="cuda")
|
||||
num_tokens_post_pad = torch.empty((1,), dtype=torch.int32, device="cuda")
|
||||
|
||||
quantiles = [0.5, 0.2, 0.8]
|
||||
|
||||
if provider == "vllm":
|
||||
ms, min_ms, max_ms = triton.testing.do_bench(
|
||||
lambda: ops.moe_align_block_size(
|
||||
topk_ids,
|
||||
num_experts,
|
||||
block_size,
|
||||
sorted_ids.clone(),
|
||||
expert_ids.clone(),
|
||||
num_tokens_post_pad.clone(),
|
||||
),
|
||||
quantiles=quantiles,
|
||||
)
|
||||
elif provider == "triton":
|
||||
ms, min_ms, max_ms = triton.testing.do_bench(
|
||||
lambda: moe_align_block_size_triton(
|
||||
topk_ids,
|
||||
num_experts,
|
||||
block_size,
|
||||
sorted_ids.clone(),
|
||||
expert_ids.clone(),
|
||||
num_tokens_post_pad.clone(),
|
||||
),
|
||||
lambda: moe_align_block_size(topk_ids, block_size, num_experts),
|
||||
quantiles=quantiles,
|
||||
)
|
||||
|
||||
@ -151,6 +71,4 @@ if __name__ == "__main__":
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
print("Running correctness check...")
|
||||
check_correctness(num_tokens=1024, num_experts=args.num_experts, topk=args.topk)
|
||||
benchmark.run(print_data=True, show_plots=True)
|
||||
|
||||
@ -8,12 +8,13 @@ import ray
|
||||
import torch
|
||||
from transformers import AutoConfig
|
||||
|
||||
from vllm.model_executor.layers.fused_moe.deep_gemm_moe import (
|
||||
from vllm.model_executor.layers.fused_moe.fused_moe import *
|
||||
from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import (
|
||||
_moe_permute,
|
||||
_moe_unpermute_and_reduce,
|
||||
moe_permute,
|
||||
moe_unpermute,
|
||||
)
|
||||
from vllm.model_executor.layers.fused_moe.fused_moe import *
|
||||
from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import *
|
||||
from vllm.model_executor.layers.fused_moe.utils import _fp8_quantize
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
@ -63,18 +64,19 @@ def benchmark_permute(
|
||||
|
||||
def run():
|
||||
if use_customized_permute:
|
||||
(permuted_hidden_states, first_token_off, inv_perm_idx, m_indices) = (
|
||||
moe_permute(
|
||||
qhidden_states,
|
||||
topk_weights=topk_weights,
|
||||
topk_ids=topk_ids,
|
||||
token_expert_indices=token_expert_indices,
|
||||
topk=topk,
|
||||
n_expert=num_experts,
|
||||
n_local_expert=num_experts,
|
||||
expert_map=None,
|
||||
align_block_size=align_block_size,
|
||||
)
|
||||
(
|
||||
permuted_hidden_states,
|
||||
a1q_scale,
|
||||
first_token_off,
|
||||
inv_perm_idx,
|
||||
m_indices,
|
||||
) = moe_permute(
|
||||
qhidden_states,
|
||||
a1q_scale=None,
|
||||
topk_ids=topk_ids,
|
||||
n_expert=num_experts,
|
||||
expert_map=None,
|
||||
align_block_size=align_block_size,
|
||||
)
|
||||
else:
|
||||
(
|
||||
@ -150,18 +152,19 @@ def benchmark_unpermute(
|
||||
|
||||
def prepare():
|
||||
if use_customized_permute:
|
||||
(permuted_hidden_states, first_token_off, inv_perm_idx, m_indices) = (
|
||||
moe_permute(
|
||||
qhidden_states,
|
||||
topk_weights=topk_weights,
|
||||
topk_ids=topk_ids,
|
||||
token_expert_indices=token_expert_indices,
|
||||
topk=topk,
|
||||
n_expert=num_experts,
|
||||
n_local_expert=num_experts,
|
||||
expert_map=None,
|
||||
align_block_size=align_block_size,
|
||||
)
|
||||
(
|
||||
permuted_hidden_states,
|
||||
a1q_scale,
|
||||
first_token_off,
|
||||
inv_perm_idx,
|
||||
m_indices,
|
||||
) = moe_permute(
|
||||
qhidden_states,
|
||||
a1q_scale=None,
|
||||
topk_ids=topk_ids,
|
||||
n_expert=num_experts,
|
||||
expert_map=None,
|
||||
align_block_size=align_block_size,
|
||||
)
|
||||
# convert to fp16/bf16 as gemm output
|
||||
return (
|
||||
@ -191,16 +194,19 @@ def benchmark_unpermute(
|
||||
|
||||
def run(input: tuple):
|
||||
if use_customized_permute:
|
||||
(permuted_hidden_states, first_token_off, inv_perm_idx, m_indices) = input
|
||||
(
|
||||
permuted_hidden_states,
|
||||
first_token_off,
|
||||
inv_perm_idx,
|
||||
m_indices,
|
||||
) = input
|
||||
output = torch.empty_like(hidden_states)
|
||||
moe_unpermute(
|
||||
output,
|
||||
permuted_hidden_states,
|
||||
topk_weights,
|
||||
topk_ids,
|
||||
inv_perm_idx,
|
||||
first_token_off,
|
||||
topk,
|
||||
num_experts,
|
||||
num_experts,
|
||||
)
|
||||
else:
|
||||
(
|
||||
@ -211,7 +217,11 @@ def benchmark_unpermute(
|
||||
inv_perm,
|
||||
) = input
|
||||
_moe_unpermute_and_reduce(
|
||||
output_hidden_states, permuted_hidden_states, inv_perm, topk_weights
|
||||
output_hidden_states,
|
||||
permuted_hidden_states,
|
||||
inv_perm,
|
||||
topk_weights,
|
||||
True,
|
||||
)
|
||||
|
||||
# JIT compilation & warmup
|
||||
|
||||
@ -151,7 +151,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
||||
ops.impl("rotary_embedding", torch::kCPU, &rotary_embedding);
|
||||
|
||||
// Quantization
|
||||
#if defined(__AVX512F__) || defined(__aarch64__)
|
||||
#if defined(__AVX512F__) || (defined(__aarch64__) && !defined(__APPLE__))
|
||||
at::Tag stride_tag = at::Tag::needs_fixed_stride_order;
|
||||
|
||||
// Compute int8 quantized tensor for given scaling factor.
|
||||
|
||||
@ -10,32 +10,28 @@
|
||||
|
||||
void moe_permute(
|
||||
const torch::Tensor& input, // [n_token, hidden]
|
||||
const torch::Tensor& topk_weights, //[n_token, topk]
|
||||
torch::Tensor& topk_ids, // [n_token, topk]
|
||||
const torch::Tensor& topk_ids, // [n_token, topk]
|
||||
const torch::Tensor& token_expert_indices, // [n_token, topk]
|
||||
const std::optional<torch::Tensor>& expert_map, // [n_expert]
|
||||
int64_t n_expert, int64_t n_local_expert, int64_t topk,
|
||||
const std::optional<int64_t>& align_block_size,
|
||||
torch::Tensor&
|
||||
permuted_input, // [topk * n_token/align_block_size_m, hidden]
|
||||
torch::Tensor& permuted_input, // [permuted_size, hidden]
|
||||
torch::Tensor& expert_first_token_offset, // [n_local_expert + 1]
|
||||
torch::Tensor& src_row_id2dst_row_id_map, // [n_token, topk]
|
||||
torch::Tensor& inv_permuted_idx, // [n_token, topk]
|
||||
torch::Tensor& permuted_idx, // [permute_size]
|
||||
torch::Tensor& m_indices) { // [align_expand_m]
|
||||
TORCH_CHECK(topk_weights.scalar_type() == at::ScalarType::Float,
|
||||
"topk_weights must be float32");
|
||||
TORCH_CHECK(expert_first_token_offset.scalar_type() == at::ScalarType::Long,
|
||||
"expert_first_token_offset must be int64");
|
||||
TORCH_CHECK(topk_ids.scalar_type() == at::ScalarType::Int,
|
||||
"topk_ids must be int32");
|
||||
TORCH_CHECK(token_expert_indices.scalar_type() == at::ScalarType::Int,
|
||||
"token_expert_indices must be int32");
|
||||
TORCH_CHECK(src_row_id2dst_row_id_map.scalar_type() == at::ScalarType::Int,
|
||||
"src_row_id2dst_row_id_map must be int32");
|
||||
TORCH_CHECK(inv_permuted_idx.scalar_type() == at::ScalarType::Int,
|
||||
"inv_permuted_idx must be int32");
|
||||
TORCH_CHECK(expert_first_token_offset.size(0) == n_local_expert + 1,
|
||||
"expert_first_token_offset shape != n_local_expert+1")
|
||||
TORCH_CHECK(
|
||||
src_row_id2dst_row_id_map.sizes() == token_expert_indices.sizes(),
|
||||
"token_expert_indices shape must be same as src_row_id2dst_row_id_map");
|
||||
TORCH_CHECK(inv_permuted_idx.sizes() == token_expert_indices.sizes(),
|
||||
"token_expert_indices shape must be same as inv_permuted_idx");
|
||||
auto n_token = input.sizes()[0];
|
||||
auto n_hidden = input.sizes()[1];
|
||||
auto align_block_size_value =
|
||||
@ -46,8 +42,9 @@ void moe_permute(
|
||||
auto sort_workspace = torch::empty(
|
||||
{sorter_size},
|
||||
torch::dtype(torch::kInt8).device(torch::kCUDA).requires_grad(false));
|
||||
auto copy_topk_ids = topk_ids.clone(); // copy topk_ids for preprocess
|
||||
auto permuted_experts_id = torch::empty_like(topk_ids);
|
||||
auto dst_row_id2src_row_id_map = torch::empty_like(src_row_id2dst_row_id_map);
|
||||
auto sorted_row_idx = torch::empty_like(inv_permuted_idx);
|
||||
auto align_expert_first_token_offset =
|
||||
torch::zeros_like(expert_first_token_offset);
|
||||
|
||||
@ -67,24 +64,22 @@ void moe_permute(
|
||||
const int* expert_map_ptr = get_ptr<int>(expert_map.value());
|
||||
valid_num_ptr =
|
||||
get_ptr<int64_t>(expert_first_token_offset) + n_local_expert;
|
||||
preprocessTopkIdLauncher(get_ptr<int>(topk_ids), n_token * topk,
|
||||
preprocessTopkIdLauncher(get_ptr<int>(copy_topk_ids), n_token * topk,
|
||||
expert_map_ptr, n_expert, stream);
|
||||
}
|
||||
// expert sort topk expert id and scan expert id get expert_first_token_offset
|
||||
sortAndScanExpert(get_ptr<int>(topk_ids), get_ptr<int>(token_expert_indices),
|
||||
get_ptr<int>(permuted_experts_id),
|
||||
get_ptr<int>(dst_row_id2src_row_id_map),
|
||||
get_ptr<int64_t>(expert_first_token_offset), n_token,
|
||||
n_expert, n_local_expert, topk, sorter,
|
||||
get_ptr<int>(sort_workspace), stream);
|
||||
sortAndScanExpert(
|
||||
get_ptr<int>(copy_topk_ids), get_ptr<int>(token_expert_indices),
|
||||
get_ptr<int>(permuted_experts_id), get_ptr<int>(sorted_row_idx),
|
||||
get_ptr<int64_t>(expert_first_token_offset), n_token, n_expert,
|
||||
n_local_expert, topk, sorter, get_ptr<int>(sort_workspace), stream);
|
||||
|
||||
// dispatch expandInputRowsKernelLauncher
|
||||
MOE_DISPATCH(input.scalar_type(), [&] {
|
||||
expandInputRowsKernelLauncher<scalar_t>(
|
||||
get_ptr<scalar_t>(input), get_ptr<scalar_t>(permuted_input),
|
||||
get_ptr<float>(topk_weights), get_ptr<int>(permuted_experts_id),
|
||||
get_ptr<int>(dst_row_id2src_row_id_map),
|
||||
get_ptr<int>(src_row_id2dst_row_id_map),
|
||||
get_ptr<int>(permuted_experts_id), get_ptr<int>(sorted_row_idx),
|
||||
get_ptr<int>(inv_permuted_idx), get_ptr<int>(permuted_idx),
|
||||
get_ptr<int64_t>(expert_first_token_offset), n_token, valid_num_ptr,
|
||||
n_hidden, topk, n_local_expert, align_block_size_value, stream);
|
||||
});
|
||||
@ -101,32 +96,34 @@ void moe_permute(
|
||||
}
|
||||
|
||||
void moe_unpermute(
|
||||
const torch::Tensor& permuted_hidden_states, // [n_token * topk, hidden]
|
||||
const torch::Tensor& topk_weights, //[n_token, topk]
|
||||
const torch::Tensor& topk_ids, // [n_token, topk]
|
||||
const torch::Tensor& src_row_id2dst_row_id_map, // [n_token, topk]
|
||||
const torch::Tensor& expert_first_token_offset, // [n_local_expert+1]
|
||||
int64_t n_expert, int64_t n_local_expert, int64_t topk,
|
||||
const torch::Tensor& permuted_hidden_states, // [n_token * topk, hidden]
|
||||
const torch::Tensor& topk_weights, // [n_token, topk]
|
||||
const torch::Tensor& inv_permuted_idx, // [n_token, topk]
|
||||
const std::optional<torch::Tensor>&
|
||||
expert_first_token_offset, // [n_local_expert+1]
|
||||
int64_t topk,
|
||||
torch::Tensor& hidden_states // [n_token, hidden]
|
||||
) {
|
||||
TORCH_CHECK(src_row_id2dst_row_id_map.sizes() == topk_ids.sizes(),
|
||||
"topk_ids shape must be same as src_row_id2dst_row_id_map");
|
||||
TORCH_CHECK(topk_ids.scalar_type() == at::ScalarType::Int,
|
||||
"topk_ids must be int32");
|
||||
TORCH_CHECK(
|
||||
permuted_hidden_states.scalar_type() == hidden_states.scalar_type(),
|
||||
"topk_ids dtype must be same as src_row_id2dst_row_id_map");
|
||||
"permuted_hidden_states dtype must be same as hidden_states");
|
||||
auto n_token = hidden_states.size(0);
|
||||
auto n_hidden = hidden_states.size(1);
|
||||
auto stream = at::cuda::getCurrentCUDAStream().stream();
|
||||
const int64_t* valid_ptr =
|
||||
get_ptr<int64_t>(expert_first_token_offset) + n_local_expert;
|
||||
|
||||
int64_t const* valid_ptr = nullptr;
|
||||
if (expert_first_token_offset.has_value()) {
|
||||
int n_local_expert = expert_first_token_offset.value().size(0) - 1;
|
||||
valid_ptr =
|
||||
get_ptr<int64_t>(expert_first_token_offset.value()) + n_local_expert;
|
||||
}
|
||||
|
||||
MOE_DISPATCH(hidden_states.scalar_type(), [&] {
|
||||
finalizeMoeRoutingKernelLauncher<scalar_t, scalar_t>(
|
||||
get_ptr<scalar_t>(permuted_hidden_states),
|
||||
get_ptr<scalar_t>(hidden_states), get_ptr<float>(topk_weights),
|
||||
get_ptr<int>(src_row_id2dst_row_id_map), get_ptr<int>(topk_ids),
|
||||
n_token, n_hidden, topk, valid_ptr, stream);
|
||||
get_ptr<int>(inv_permuted_idx), n_token, n_hidden, topk, valid_ptr,
|
||||
stream);
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
@ -177,7 +177,7 @@ __global__ void getMIndicesKernel(int64_t* expert_first_token_offset,
|
||||
int tidx = threadIdx.x;
|
||||
extern __shared__ int64_t smem_expert_first_token_offset[];
|
||||
for (int i = tidx; i <= num_local_expert; i += blockDim.x) {
|
||||
smem_expert_first_token_offset[tidx] = __ldg(expert_first_token_offset + i);
|
||||
smem_expert_first_token_offset[i] = __ldg(expert_first_token_offset + i);
|
||||
}
|
||||
__syncthreads();
|
||||
auto last_token_offset = smem_expert_first_token_offset[eidx + 1];
|
||||
|
||||
@ -57,31 +57,19 @@ void sortAndScanExpert(int* expert_for_source_row, const int* source_rows,
|
||||
|
||||
template <typename T>
|
||||
void expandInputRowsKernelLauncher(
|
||||
T const* unpermuted_input, T* permuted_output,
|
||||
const float* unpermuted_scales, int* sorted_experts,
|
||||
T const* unpermuted_input, T* permuted_output, int* sorted_experts,
|
||||
int const* expanded_dest_row_to_expanded_source_row,
|
||||
int* expanded_source_row_to_expanded_dest_row,
|
||||
int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
|
||||
int64_t* expert_first_token_offset, int64_t const num_rows,
|
||||
int64_t const* num_valid_tokens_ptr, int64_t const cols, int const k,
|
||||
int num_local_experts, const int& align_block_size, cudaStream_t stream);
|
||||
|
||||
// Final kernel to unpermute and scale
|
||||
// This kernel unpermutes the original data, does the k-way reduction and
|
||||
// performs the final skip connection.
|
||||
template <typename T, typename OutputType, bool CHECK_SKIPPED>
|
||||
__global__ void finalizeMoeRoutingKernel(
|
||||
T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output,
|
||||
float const* scales, int const* expanded_source_row_to_expanded_dest_row,
|
||||
int const* expert_for_source_row, int64_t const orig_cols, int64_t const k,
|
||||
int64_t const* num_valid_ptr);
|
||||
|
||||
template <class T, class OutputType>
|
||||
void finalizeMoeRoutingKernelLauncher(
|
||||
T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output,
|
||||
float const* scales, int const* expanded_source_row_to_expanded_dest_row,
|
||||
int const* expert_for_source_row, int64_t const num_rows,
|
||||
int64_t const cols, int64_t const k, int64_t const* num_valid_ptr,
|
||||
cudaStream_t stream);
|
||||
int64_t const num_rows, int64_t const cols, int64_t const k,
|
||||
int64_t const* num_valid_ptr, cudaStream_t stream);
|
||||
|
||||
void preprocessTopkIdLauncher(int* topk_id_ptr, int size,
|
||||
const int* expert_map_ptr, int num_experts,
|
||||
|
||||
@ -2,10 +2,9 @@
|
||||
|
||||
template <typename T, bool CHECK_SKIPPED, bool ALIGN_BLOCK_SIZE>
|
||||
__global__ void expandInputRowsKernel(
|
||||
T const* unpermuted_input, T* permuted_output,
|
||||
const float* unpermuted_scales, int* sorted_experts,
|
||||
T const* unpermuted_input, T* permuted_output, int* sorted_experts,
|
||||
int const* expanded_dest_row_to_expanded_source_row,
|
||||
int* expanded_source_row_to_expanded_dest_row,
|
||||
int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
|
||||
int64_t* expert_first_token_offset, int64_t const num_rows,
|
||||
int64_t const* num_dest_rows, int64_t const cols, int64_t k,
|
||||
int num_local_experts, int align_block_size) {
|
||||
@ -54,6 +53,10 @@ __global__ void expandInputRowsKernel(
|
||||
assert(expanded_dest_row <= INT32_MAX);
|
||||
expanded_source_row_to_expanded_dest_row[expanded_source_row] =
|
||||
static_cast<int>(expanded_dest_row);
|
||||
// skip non local expert token
|
||||
if (!CHECK_SKIPPED || blockIdx.x < *num_dest_rows) {
|
||||
permuted_idx[expanded_dest_row] = expanded_source_row;
|
||||
}
|
||||
}
|
||||
|
||||
if (!CHECK_SKIPPED || blockIdx.x < *num_dest_rows) {
|
||||
@ -62,7 +65,7 @@ __global__ void expandInputRowsKernel(
|
||||
using DataElem = cutlass::Array<T, ELEM_PER_THREAD>;
|
||||
|
||||
// Duplicate and permute rows
|
||||
int64_t const source_row = expanded_source_row % num_rows;
|
||||
int64_t const source_row = expanded_source_row / k;
|
||||
|
||||
auto const* source_row_ptr =
|
||||
reinterpret_cast<DataElem const*>(unpermuted_input + source_row * cols);
|
||||
@ -82,10 +85,9 @@ __global__ void expandInputRowsKernel(
|
||||
|
||||
template <typename T>
|
||||
void expandInputRowsKernelLauncher(
|
||||
T const* unpermuted_input, T* permuted_output,
|
||||
const float* unpermuted_scales, int* sorted_experts,
|
||||
T const* unpermuted_input, T* permuted_output, int* sorted_experts,
|
||||
int const* expanded_dest_row_to_expanded_source_row,
|
||||
int* expanded_source_row_to_expanded_dest_row,
|
||||
int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
|
||||
int64_t* expert_first_token_offset, int64_t const num_rows,
|
||||
int64_t const* num_valid_tokens_ptr, int64_t const cols, int const k,
|
||||
int num_local_experts, const int& align_block_size, cudaStream_t stream) {
|
||||
@ -105,11 +107,11 @@ void expandInputRowsKernelLauncher(
|
||||
int64_t smem_size = sizeof(int64_t) * (num_local_experts + 1);
|
||||
|
||||
func<<<blocks, threads, smem_size, stream>>>(
|
||||
unpermuted_input, permuted_output, unpermuted_scales, sorted_experts,
|
||||
unpermuted_input, permuted_output, sorted_experts,
|
||||
expanded_dest_row_to_expanded_source_row,
|
||||
expanded_source_row_to_expanded_dest_row, expert_first_token_offset,
|
||||
num_rows, num_valid_tokens_ptr, cols, k, num_local_experts,
|
||||
align_block_size);
|
||||
expanded_source_row_to_expanded_dest_row, permuted_idx,
|
||||
expert_first_token_offset, num_rows, num_valid_tokens_ptr, cols, k,
|
||||
num_local_experts, align_block_size);
|
||||
}
|
||||
|
||||
template <class T, class U>
|
||||
@ -128,11 +130,9 @@ template <typename T, typename OutputType, bool CHECK_SKIPPED>
|
||||
__global__ void finalizeMoeRoutingKernel(
|
||||
T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output,
|
||||
float const* scales, int const* expanded_source_row_to_expanded_dest_row,
|
||||
int const* expert_for_source_row, int64_t const orig_cols, int64_t const k,
|
||||
int64_t const* num_valid_ptr) {
|
||||
int64_t const orig_cols, int64_t const k, int64_t const* num_valid_ptr) {
|
||||
assert(orig_cols % 4 == 0);
|
||||
int64_t const original_row = blockIdx.x;
|
||||
int64_t const num_rows = gridDim.x;
|
||||
auto const offset = original_row * orig_cols;
|
||||
OutputType* reduced_row_ptr = reduced_unpermuted_output + offset;
|
||||
int64_t const num_valid = *num_valid_ptr;
|
||||
@ -159,14 +159,13 @@ __global__ void finalizeMoeRoutingKernel(
|
||||
ComputeElem thread_output;
|
||||
thread_output.fill(0);
|
||||
for (int k_idx = 0; k_idx < k; ++k_idx) {
|
||||
int64_t const expanded_original_row = original_row + k_idx * num_rows;
|
||||
int64_t const expanded_original_row = original_row * k + k_idx;
|
||||
int64_t const expanded_permuted_row =
|
||||
expanded_source_row_to_expanded_dest_row[expanded_original_row];
|
||||
|
||||
int64_t const k_offset = original_row * k + k_idx;
|
||||
float const row_scale = scales[k_offset];
|
||||
|
||||
// Check after row_rescale has accumulated
|
||||
if (CHECK_SKIPPED && expanded_permuted_row >= num_valid) {
|
||||
continue;
|
||||
}
|
||||
@ -189,9 +188,8 @@ template <class T, class OutputType>
|
||||
void finalizeMoeRoutingKernelLauncher(
|
||||
T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output,
|
||||
float const* scales, int const* expanded_source_row_to_expanded_dest_row,
|
||||
int const* expert_for_source_row, int64_t const num_rows,
|
||||
int64_t const cols, int64_t const k, int64_t const* num_valid_ptr,
|
||||
cudaStream_t stream) {
|
||||
int64_t const num_rows, int64_t const cols, int64_t const k,
|
||||
int64_t const* num_valid_ptr, cudaStream_t stream) {
|
||||
int64_t const blocks = num_rows;
|
||||
int64_t const threads = 256;
|
||||
bool const check_finished = num_valid_ptr != nullptr;
|
||||
@ -201,6 +199,5 @@ void finalizeMoeRoutingKernelLauncher(
|
||||
auto* const kernel = func_map[check_finished];
|
||||
kernel<<<blocks, threads, 0, stream>>>(
|
||||
expanded_permuted_rows, reduced_unpermuted_output, scales,
|
||||
expanded_source_row_to_expanded_dest_row, expert_for_source_row, cols, k,
|
||||
num_valid_ptr);
|
||||
expanded_source_row_to_expanded_dest_row, cols, k, num_valid_ptr);
|
||||
}
|
||||
|
||||
@ -56,18 +56,17 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
|
||||
" -> Tensor");
|
||||
|
||||
m.def(
|
||||
"moe_permute(Tensor input, Tensor topk_weight, Tensor! topk_ids,"
|
||||
"moe_permute(Tensor input, Tensor topk_ids,"
|
||||
"Tensor token_expert_indices, Tensor? expert_map, int n_expert,"
|
||||
"int n_local_expert,"
|
||||
"int topk, int? align_block_size,Tensor! permuted_input, Tensor! "
|
||||
"expert_first_token_offset, Tensor! src_row_id2dst_row_id_map, Tensor! "
|
||||
"m_indices)->()");
|
||||
"expert_first_token_offset, Tensor! inv_permuted_idx, Tensor! "
|
||||
"permuted_idx, Tensor! m_indices)->()");
|
||||
|
||||
m.def(
|
||||
"moe_unpermute(Tensor permuted_hidden_states, Tensor topk_weights,"
|
||||
"Tensor topk_ids,Tensor src_row_id2dst_row_id_map, Tensor "
|
||||
"expert_first_token_offset, int n_expert, int n_local_expert,int "
|
||||
"topk, Tensor! hidden_states)->()");
|
||||
"Tensor inv_permuted_idx, Tensor? expert_first_token_offset, "
|
||||
"int topk, Tensor! hidden_states)->()");
|
||||
|
||||
m.def("moe_permute_unpermute_supported() -> bool");
|
||||
m.impl("moe_permute_unpermute_supported", &moe_permute_unpermute_supported);
|
||||
|
||||
@ -292,6 +292,11 @@ void per_token_group_quant_fp8(const torch::Tensor& input,
|
||||
torch::Tensor& output_q, torch::Tensor& output_s,
|
||||
int64_t group_size, double eps, double fp8_min,
|
||||
double fp8_max, bool scale_ue8m0);
|
||||
|
||||
void per_token_group_quant_int8(const torch::Tensor& input,
|
||||
torch::Tensor& output_q,
|
||||
torch::Tensor& output_s, int64_t group_size,
|
||||
double eps, double int8_min, double int8_max);
|
||||
#endif
|
||||
|
||||
void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
|
||||
|
||||
@ -1,6 +1,8 @@
|
||||
#include <ATen/cuda/CUDAContext.h>
|
||||
#include <torch/all.h>
|
||||
|
||||
#include "../per_token_group_quant_8bit.h"
|
||||
|
||||
#include <cmath>
|
||||
|
||||
#include "../../dispatch_utils.h"
|
||||
@ -336,3 +338,11 @@ void dynamic_scaled_int8_quant(
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
void per_token_group_quant_int8(const torch::Tensor& input,
|
||||
torch::Tensor& output_q,
|
||||
torch::Tensor& output_s, int64_t group_size,
|
||||
double eps, double int8_min, double int8_max) {
|
||||
per_token_group_quant_8bit(input, output_q, output_s, group_size, eps,
|
||||
int8_min, int8_max);
|
||||
}
|
||||
@ -1,6 +1,8 @@
|
||||
#include <ATen/cuda/CUDAContext.h>
|
||||
#include <c10/util/Float8_e4m3fn.h>
|
||||
|
||||
#include "../per_token_group_quant_8bit.h"
|
||||
|
||||
#include <cmath>
|
||||
|
||||
#include <cuda_fp16.h>
|
||||
@ -120,7 +122,7 @@ void per_token_group_quant_8bit(const torch::Tensor& input,
|
||||
torch::Tensor& output_q,
|
||||
torch::Tensor& output_s, int64_t group_size,
|
||||
double eps, double min_8bit, double max_8bit,
|
||||
bool scale_ue8m0 = false) {
|
||||
bool scale_ue8m0) {
|
||||
TORCH_CHECK(input.is_contiguous());
|
||||
TORCH_CHECK(output_q.is_contiguous());
|
||||
|
||||
@ -198,6 +200,8 @@ void per_token_group_quant_8bit(const torch::Tensor& input,
|
||||
input.scalar_type(), "per_token_group_quant_8bit", ([&] {
|
||||
if (dst_type == at::ScalarType::Float8_e4m3fn) {
|
||||
LAUNCH_KERNEL(scalar_t, c10::Float8_e4m3fn);
|
||||
} else if (dst_type == at::ScalarType::Char) {
|
||||
LAUNCH_KERNEL(scalar_t, int8_t);
|
||||
}
|
||||
}));
|
||||
|
||||
|
||||
@ -187,8 +187,12 @@ struct PrepackedLayoutBTemplate {
|
||||
CUTE_HOST_DEVICE static constexpr auto TVbNbKL_to_offset_copy(
|
||||
Shape_NKL shape_mkl) {
|
||||
auto layout = TVbNbKL_to_offset(shape_mkl);
|
||||
return make_layout(coalesce(get<0>(layout)), get<1>(layout),
|
||||
get<2>(layout));
|
||||
// for 4-bit elements, having >= 64 values per column
|
||||
// allows TMA to load full 32-byte sectors
|
||||
auto inner_layout =
|
||||
make_layout(make_shape(_256{}, size<0>(layout) / _256{}));
|
||||
|
||||
return make_layout(inner_layout, get<1>(layout), get<2>(layout));
|
||||
}
|
||||
|
||||
// ((BlockN, BlockK), (BlocksN, BlocksK), L) -> (storage_idx)
|
||||
|
||||
10
csrc/quantization/per_token_group_quant_8bit.h
Normal file
10
csrc/quantization/per_token_group_quant_8bit.h
Normal file
@ -0,0 +1,10 @@
|
||||
#pragma once
|
||||
#include <torch/all.h>
|
||||
|
||||
// TODO(wentao): refactor the folder to 8bit, then includes fp8 and int8 folders
|
||||
// 8-bit per-token-group quantization helper used by both FP8 and INT8
|
||||
void per_token_group_quant_8bit(const torch::Tensor& input,
|
||||
torch::Tensor& output_q,
|
||||
torch::Tensor& output_s, int64_t group_size,
|
||||
double eps, double min_8bit, double max_8bit,
|
||||
bool scale_ue8m0 = false);
|
||||
@ -624,6 +624,14 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
||||
ops.impl("per_token_group_fp8_quant", torch::kCUDA,
|
||||
&per_token_group_quant_fp8);
|
||||
|
||||
// Compute per-token-group INT8 quantized tensor and scaling factor.
|
||||
ops.def(
|
||||
"per_token_group_quant_int8(Tensor input, Tensor! output_q, Tensor! "
|
||||
"output_s, int group_size, float eps, float int8_min, float int8_max) -> "
|
||||
"()");
|
||||
ops.impl("per_token_group_quant_int8", torch::kCUDA,
|
||||
&per_token_group_quant_int8);
|
||||
|
||||
// reorder weight for AllSpark Ampere W8A16 Fused Gemm kernel
|
||||
ops.def(
|
||||
"rearrange_kn_weight_as_n32k16_order(Tensor b_qweight, Tensor b_scales, "
|
||||
|
||||
@ -1,62 +0,0 @@
|
||||
# This vLLM Dockerfile is used to construct an image that can build and run vLLM on ARM CPU platform.
|
||||
|
||||
FROM ubuntu:22.04 AS cpu-test-arm
|
||||
|
||||
ENV CCACHE_DIR=/root/.cache/ccache
|
||||
|
||||
ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
|
||||
|
||||
RUN --mount=type=cache,target=/var/cache/apt \
|
||||
apt-get update -y \
|
||||
&& apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
|
||||
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
|
||||
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
|
||||
|
||||
# tcmalloc provides better memory allocation efficiency, e.g., holding memory in caches to speed up access of commonly-used objects.
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
pip install py-cpuinfo # Use this to gather CPU info and optimize based on ARM Neoverse cores
|
||||
|
||||
# Set LD_PRELOAD for tcmalloc on ARM
|
||||
ENV LD_PRELOAD="/usr/lib/aarch64-linux-gnu/libtcmalloc_minimal.so.4"
|
||||
|
||||
RUN echo 'ulimit -c 0' >> ~/.bashrc
|
||||
|
||||
WORKDIR /workspace
|
||||
|
||||
ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
|
||||
ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
--mount=type=bind,src=requirements/build.txt,target=requirements/build.txt \
|
||||
pip install --upgrade pip && \
|
||||
pip install -r requirements/build.txt
|
||||
|
||||
FROM cpu-test-arm AS build
|
||||
|
||||
WORKDIR /workspace/vllm
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
--mount=type=bind,src=requirements/common.txt,target=requirements/common.txt \
|
||||
--mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \
|
||||
pip install -v -r requirements/cpu.txt
|
||||
|
||||
COPY . .
|
||||
ARG GIT_REPO_CHECK=0
|
||||
RUN --mount=type=bind,source=.git,target=.git \
|
||||
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
|
||||
|
||||
# Disabling AVX512 specific optimizations for ARM
|
||||
ARG VLLM_CPU_DISABLE_AVX512="true"
|
||||
ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
--mount=type=cache,target=/root/.cache/ccache \
|
||||
--mount=type=bind,source=.git,target=.git \
|
||||
VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
|
||||
pip install dist/*.whl && \
|
||||
rm -rf dist
|
||||
|
||||
WORKDIR /workspace/
|
||||
|
||||
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
|
||||
|
||||
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
|
||||
@ -1,4 +1,11 @@
|
||||
# This vLLM Dockerfile is used to construct image that can build and run vLLM on x86 CPU platform.
|
||||
# This vLLM Dockerfile is used to build images that can run vLLM on both x86_64 and arm64 CPU platforms.
|
||||
#
|
||||
# Supported platforms:
|
||||
# - linux/amd64 (x86_64)
|
||||
# - linux/arm64 (aarch64)
|
||||
#
|
||||
# Use the `--platform` option with `docker buildx build` to specify the target architecture, e.g.:
|
||||
# docker buildx build --platform=linux/arm64 -f docker/Dockerfile.cpu .
|
||||
#
|
||||
# Build targets:
|
||||
# vllm-openai (default): used for serving deployment
|
||||
@ -53,7 +60,20 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --upgrade pip && \
|
||||
uv pip install -r requirements/cpu.txt
|
||||
|
||||
ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/opt/venv/lib/libiomp5.so:$LD_PRELOAD"
|
||||
ARG TARGETARCH
|
||||
ENV TARGETARCH=${TARGETARCH}
|
||||
|
||||
RUN if [ "$TARGETARCH" = "arm64" ]; then \
|
||||
PRELOAD_PATH="/usr/lib/aarch64-linux-gnu/libtcmalloc_minimal.so.4"; \
|
||||
else \
|
||||
PRELOAD_PATH="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/opt/venv/lib/libiomp5.so"; \
|
||||
fi && \
|
||||
echo "export LD_PRELOAD=$PRELOAD_PATH" >> ~/.bashrc
|
||||
|
||||
# Ensure that the LD_PRELOAD environment variable for export is in effect.
|
||||
SHELL ["/bin/bash", "-c"]
|
||||
|
||||
ENV LD_PRELOAD=${LD_PRELOAD}
|
||||
|
||||
RUN echo 'ulimit -c 0' >> ~/.bashrc
|
||||
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
ARG NIGHTLY_DATE="20250714"
|
||||
ARG NIGHTLY_DATE="20250724"
|
||||
ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.12_tpuvm_$NIGHTLY_DATE"
|
||||
|
||||
FROM $BASE_IMAGE
|
||||
|
||||
@ -9,10 +9,13 @@ We support tracing vLLM workers using the `torch.profiler` module. You can enabl
|
||||
|
||||
The OpenAI server also needs to be started with the `VLLM_TORCH_PROFILER_DIR` environment variable set.
|
||||
|
||||
When using `benchmarks/benchmark_serving.py`, you can enable profiling by passing the `--profile` flag.
|
||||
When using `vllm bench serve`, you can enable profiling by passing the `--profile` flag.
|
||||
|
||||
Traces can be visualized using <https://ui.perfetto.dev/>.
|
||||
|
||||
!!! tip
|
||||
You can directly call bench module without installing vllm using `python -m vllm.entrypoints.cli.main bench`.
|
||||
|
||||
!!! tip
|
||||
Only send a few requests through vLLM when profiling, as the traces can get quite large. Also, no need to untar the traces, they can be viewed directly.
|
||||
|
||||
@ -35,10 +38,10 @@ VLLM_TORCH_PROFILER_DIR=./vllm_profile \
|
||||
--model meta-llama/Meta-Llama-3-70B
|
||||
```
|
||||
|
||||
benchmark_serving.py:
|
||||
vllm bench command:
|
||||
|
||||
```bash
|
||||
python benchmarks/benchmark_serving.py \
|
||||
vllm bench serve \
|
||||
--backend vllm \
|
||||
--model meta-llama/Meta-Llama-3-70B \
|
||||
--dataset-name sharegpt \
|
||||
@ -69,13 +72,13 @@ apt install nsight-systems-cli
|
||||
|
||||
For basic usage, you can just append `nsys profile -o report.nsys-rep --trace-fork-before-exec=true --cuda-graph-trace=node` before any existing script you would run for offline inference.
|
||||
|
||||
The following is an example using the `benchmarks/benchmark_latency.py` script:
|
||||
The following is an example using the `vllm bench latency` script:
|
||||
|
||||
```bash
|
||||
nsys profile -o report.nsys-rep \
|
||||
--trace-fork-before-exec=true \
|
||||
--cuda-graph-trace=node \
|
||||
python benchmarks/benchmark_latency.py \
|
||||
vllm bench latency \
|
||||
--model meta-llama/Llama-3.1-8B-Instruct \
|
||||
--num-iters-warmup 5 \
|
||||
--num-iters 1 \
|
||||
@ -98,7 +101,7 @@ nsys profile -o report.nsys-rep \
|
||||
vllm serve meta-llama/Llama-3.1-8B-Instruct
|
||||
|
||||
# client
|
||||
python benchmarks/benchmark_serving.py \
|
||||
vllm bench serve \
|
||||
--backend vllm \
|
||||
--model meta-llama/Llama-3.1-8B-Instruct \
|
||||
--num-prompts 1 \
|
||||
@ -132,7 +135,7 @@ You can view these profiles either as summaries in the CLI, using `nsys stats [p
|
||||
...
|
||||
** CUDA GPU Kernel Summary (cuda_gpu_kern_sum):
|
||||
|
||||
Time (%) Total Time (ns) Instances Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name
|
||||
Time (%) Total Time (ns) Instances Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name
|
||||
-------- --------------- --------- ----------- ----------- -------- --------- ----------- ----------------------------------------------------------------------------------------------------
|
||||
46.3 10,327,352,338 17,505 589,965.9 144,383.0 27,040 3,126,460 944,263.8 sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize128x128x64_warpgroupsize1x1x1_execute_segment_k_of…
|
||||
14.8 3,305,114,764 5,152 641,520.7 293,408.0 287,296 2,822,716 867,124.9 sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize256x128x64_warpgroupsize2x1x1_execute_segment_k_of…
|
||||
@ -143,7 +146,7 @@ You can view these profiles either as summaries in the CLI, using `nsys stats [p
|
||||
2.6 587,283,113 37,824 15,526.7 3,008.0 2,719 2,517,756 139,091.1 std::enable_if<T2>(int)0&&vllm::_typeConvert<T1>::exists, void>::type vllm::fused_add_rms_norm_kern…
|
||||
1.9 418,362,605 18,912 22,121.5 3,871.0 3,328 2,523,870 175,248.2 void vllm::rotary_embedding_kernel<c10::BFloat16, (bool)1>(const long *, T1 *, T1 *, const T1 *, in…
|
||||
0.7 167,083,069 18,880 8,849.7 2,240.0 1,471 2,499,996 101,436.1 void vllm::reshape_and_cache_flash_kernel<__nv_bfloat16, __nv_bfloat16, (vllm::Fp8KVCacheDataType)0…
|
||||
...
|
||||
...
|
||||
```
|
||||
|
||||
GUI example:
|
||||
|
||||
@ -3,14 +3,14 @@ An implementation of xPyD with dynamic scaling based on point-to-point communica
|
||||
# Detailed Design
|
||||
|
||||
## Overall Process
|
||||
As shown in Figure 1, the overall process of this **PD disaggregation** solution is described through a request flow:
|
||||
As shown in Figure 1, the overall process of this **PD disaggregation** solution is described through a request flow:
|
||||
|
||||
1. The client sends an HTTP request to the Proxy/Router's `/v1/completions` interface.
|
||||
2. The Proxy/Router selects a **1P1D (1 Prefill instance + 1 Decode instance)** through either through round-robin or random selection, generates a `request_id` (rules to be introduced later), modifies the `max_tokens` in the HTTP request message to **1**, and then forwards the request to the **P instance**.
|
||||
3. Immediately afterward, the Proxy/Router forwards the **original HTTP request** to the **D instance**.
|
||||
4. The **P instance** performs **Prefill** and then **actively sends the generated KV cache** to the D instance (using **PUT_ASYNC** mode). The D instance's `zmq_addr` can be resolved through the `request_id`.
|
||||
5. The **D instance** has a **dedicated thread** for receiving the KV cache (to avoid blocking the main process). The received KV cache is saved into the **GPU memory buffer**, the size of which is determined by the vLLM startup parameter `kv_buffer_size`. When the GPU buffer is full, the KV cache is stored in the **local Tensor memory pool**.
|
||||
6. During the **Decode**, the D instance's main process retrieves the KV cache (transmitted by the P instance) from either the **GPU buffer** or the **memory pool**, thereby **skipping Prefill**.
|
||||
1. The client sends an HTTP request to the Proxy/Router's `/v1/completions` interface.
|
||||
2. The Proxy/Router selects a **1P1D (1 Prefill instance + 1 Decode instance)** through either through round-robin or random selection, generates a `request_id` (rules to be introduced later), modifies the `max_tokens` in the HTTP request message to **1**, and then forwards the request to the **P instance**.
|
||||
3. Immediately afterward, the Proxy/Router forwards the **original HTTP request** to the **D instance**.
|
||||
4. The **P instance** performs **Prefill** and then **actively sends the generated KV cache** to the D instance (using **PUT_ASYNC** mode). The D instance's `zmq_addr` can be resolved through the `request_id`.
|
||||
5. The **D instance** has a **dedicated thread** for receiving the KV cache (to avoid blocking the main process). The received KV cache is saved into the **GPU memory buffer**, the size of which is determined by the vLLM startup parameter `kv_buffer_size`. When the GPU buffer is full, the KV cache is stored in the **local Tensor memory pool**.
|
||||
6. During the **Decode**, the D instance's main process retrieves the KV cache (transmitted by the P instance) from either the **GPU buffer** or the **memory pool**, thereby **skipping Prefill**.
|
||||
7. After completing **Decode**, the D instance returns the result to the **Proxy/Router**, which then forwards it to the **client**.
|
||||
|
||||

|
||||
@ -291,7 +291,7 @@ curl -X POST -s http://10.0.1.1:10001/v1/completions \
|
||||
??? console "Command"
|
||||
|
||||
```shell
|
||||
python3 benchmark_serving.py \
|
||||
vllm bench serve \
|
||||
--backend vllm \
|
||||
--model base_model \
|
||||
--tokenizer meta-llama/Llama-3.1-8B-Instruct \
|
||||
|
||||
@ -177,6 +177,70 @@ Multi-image input can be extended to perform video captioning. We show this with
|
||||
You can pass a list of NumPy arrays directly to the `'video'` field of the multi-modal dictionary
|
||||
instead of using multi-image input.
|
||||
|
||||
Instead of NumPy arrays, you can also pass `'torch.Tensor'` instances, as shown in this example using Qwen2.5-VL:
|
||||
|
||||
??? code
|
||||
|
||||
```python
|
||||
from transformers import AutoProcessor
|
||||
from vllm import LLM, SamplingParams
|
||||
from qwen_vl_utils import process_vision_info
|
||||
|
||||
model_path = "Qwen/Qwen2.5-VL-3B-Instruct/"
|
||||
video_path = "https://content.pexels.com/videos/free-videos.mp4"
|
||||
|
||||
llm = LLM(
|
||||
model=model_path,
|
||||
gpu_memory_utilization=0.8,
|
||||
enforce_eager=True,
|
||||
limit_mm_per_prompt={"video": 1},
|
||||
)
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
max_tokens=1024,
|
||||
)
|
||||
|
||||
video_messages = [
|
||||
{"role": "system", "content": "You are a helpful assistant."},
|
||||
{"role": "user", "content": [
|
||||
{"type": "text", "text": "describe this video."},
|
||||
{
|
||||
"type": "video",
|
||||
"video": video_path,
|
||||
"total_pixels": 20480 * 28 * 28,
|
||||
"min_pixels": 16 * 28 * 28
|
||||
}
|
||||
]
|
||||
},
|
||||
]
|
||||
|
||||
messages = video_messages
|
||||
processor = AutoProcessor.from_pretrained(model_path)
|
||||
prompt = processor.apply_chat_template(
|
||||
messages,
|
||||
tokenize=False,
|
||||
add_generation_prompt=True,
|
||||
)
|
||||
|
||||
image_inputs, video_inputs = process_vision_info(messages)
|
||||
mm_data = {}
|
||||
if video_inputs is not None:
|
||||
mm_data["video"] = video_inputs
|
||||
|
||||
llm_inputs = {
|
||||
"prompt": prompt,
|
||||
"multi_modal_data": mm_data,
|
||||
}
|
||||
|
||||
outputs = llm.generate([llm_inputs], sampling_params=sampling_params)
|
||||
for o in outputs:
|
||||
generated_text = o.outputs[0].text
|
||||
print(generated_text)
|
||||
```
|
||||
|
||||
!!! note
|
||||
'process_vision_info' is only applicable to Qwen2.5-VL and similar models.
|
||||
|
||||
Full example: <gh-file:examples/offline_inference/vision_language.py>
|
||||
|
||||
### Audio Inputs
|
||||
|
||||
@ -6,6 +6,7 @@ Contents:
|
||||
|
||||
- [Supported Hardware](supported_hardware.md)
|
||||
- [AutoAWQ](auto_awq.md)
|
||||
- [AutoRound](auto_round.md)
|
||||
- [BitsAndBytes](bnb.md)
|
||||
- [BitBLAS](bitblas.md)
|
||||
- [GGUF](gguf.md)
|
||||
|
||||
103
docs/features/quantization/auto_round.md
Normal file
103
docs/features/quantization/auto_round.md
Normal file
@ -0,0 +1,103 @@
|
||||
# AutoRound
|
||||
|
||||
[AutoRound](https://github.com/intel/auto-round) is Intel’s advanced quantization algorithm designed to produce highly efficient **INT2, INT3, INT4, and INT8**
|
||||
quantized large language models—striking an optimal balance between accuracy and deployment performance.
|
||||
|
||||
AutoRound applies weight-only quantization to transformer-based models, enabling significant memory savings and faster
|
||||
inference while maintaining near-original accuracy. It supports a wide range of hardware platforms, including **CPUs,
|
||||
Intel GPUs, HPUs, and CUDA-enabled devices**.
|
||||
|
||||
Please refer to the [AutoRound guide](https://github.com/intel/auto-round/blob/main/docs/step_by_step.md) for more details.
|
||||
|
||||
Key Features:
|
||||
|
||||
✅ **AutoRound, AutoAWQ, AutoGPTQ, and GGUF** are supported
|
||||
|
||||
✅ **10+ vision-language models (VLMs)** are supported
|
||||
|
||||
✅ **Per-layer mixed-bit quantization** for fine-grained control
|
||||
|
||||
✅ **RTN (Round-To-Nearest) mode** for quick quantization with slight accuracy loss
|
||||
|
||||
✅ **Multiple quantization recipes**: best, base, and light
|
||||
|
||||
✅ Advanced utilities such as immediate packing and support for **10+ backends**
|
||||
|
||||
## Installation
|
||||
|
||||
```bash
|
||||
uv pip install auto-round
|
||||
```
|
||||
|
||||
## Quantizing a model
|
||||
|
||||
For VLMs, please change to `auto-round-mllm` in CLI usage and `AutoRoundMLLM` in API usage.
|
||||
|
||||
### CLI usage
|
||||
|
||||
```bash
|
||||
auto-round \
|
||||
--model Qwen/Qwen3-0.6B \
|
||||
--bits 4 \
|
||||
--group_size 128 \
|
||||
--format "auto_round" \
|
||||
--output_dir ./tmp_autoround
|
||||
```
|
||||
|
||||
```bash
|
||||
auto-round \
|
||||
--model Qwen/Qwen3-0.6B \
|
||||
--format "gguf:q4_k_m" \
|
||||
--output_dir ./tmp_autoround
|
||||
```
|
||||
|
||||
### API usage
|
||||
|
||||
```python
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
from auto_round import AutoRound
|
||||
|
||||
model_name = "Qwen/Qwen3-0.6B"
|
||||
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
||||
|
||||
bits, group_size, sym = 4, 128, True
|
||||
autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym)
|
||||
|
||||
# the best accuracy, 4-5X slower, low_gpu_mem_usage could save ~20G but ~30% slower
|
||||
# autoround = AutoRound(model, tokenizer, nsamples=512, iters=1000, low_gpu_mem_usage=True, bits=bits, group_size=group_size, sym=sym)
|
||||
|
||||
# 2-3X speedup, slight accuracy drop at W4G128
|
||||
# autoround = AutoRound(model, tokenizer, nsamples=128, iters=50, lr=5e-3, bits=bits, group_size=group_size, sym=sym )
|
||||
|
||||
output_dir = "./tmp_autoround"
|
||||
# format= 'auto_round'(default), 'auto_gptq', 'auto_awq'
|
||||
autoround.quantize_and_save(output_dir, format="auto_round")
|
||||
```
|
||||
|
||||
## Running a quantized model with vLLM
|
||||
|
||||
Here is some example code to run auto-round format in vLLM:
|
||||
|
||||
```python
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
prompts = [
|
||||
"Hello, my name is",
|
||||
]
|
||||
sampling_params = SamplingParams(temperature=0.6, top_p=0.95)
|
||||
model_name = "Intel/DeepSeek-R1-0528-Qwen3-8B-int4-AutoRound"
|
||||
llm = LLM(model=model_name)
|
||||
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
```
|
||||
|
||||
# Acknowledgement
|
||||
|
||||
Special thanks to open-source low precision libraries such as AutoGPTQ, AutoAWQ, GPTQModel, Triton, Marlin, and
|
||||
ExLLaMAV2 for providing low-precision CUDA kernels, which are leveraged in AutoRound.
|
||||
@ -33,7 +33,7 @@ Testing has been conducted on AWS Graviton3 instances for compatibility.
|
||||
# --8<-- [end:pre-built-images]
|
||||
# --8<-- [start:build-image-from-source]
|
||||
```bash
|
||||
docker build -f docker/Dockerfile.arm \
|
||||
docker build -f docker/Dockerfile.cpu \
|
||||
--tag vllm-cpu-env .
|
||||
|
||||
# Launching OpenAI server
|
||||
|
||||
@ -365,6 +365,7 @@ th {
|
||||
| `Grok1ModelForCausalLM` | Grok1 | `hpcai-tech/grok-1`. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `HunYuanDenseV1ForCausalLM` | Hunyuan-7B-Instruct-0124 | `tencent/Hunyuan-7B-Instruct-0124` | ✅︎ | | ✅︎ |
|
||||
| `HunYuanMoEV1ForCausalLM` | Hunyuan-80B-A13B | `tencent/Hunyuan-A13B-Instruct`, `tencent/Hunyuan-A13B-Pretrain`, `tencent/Hunyuan-A13B-Instruct-FP8`, etc. | ✅︎ | | ✅︎ |
|
||||
| `HCXVisionForCausalLM` | HyperCLOVAX-SEED-Vision-Instruct-3B | `naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B` | | | ✅︎ |
|
||||
| `InternLMForCausalLM` | InternLM | `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `InternLM2ForCausalLM` | InternLM2 | `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `InternLM3ForCausalLM` | InternLM3 | `internlm/internlm3-8b-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
@ -592,6 +593,7 @@ Specified using `--task generate`.
|
||||
| `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `H2OVLChatModel` | H2OVL | T + I<sup>E+</sup> | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎ |
|
||||
| `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | | ✅︎ |
|
||||
| `InternS1ForConditionalGeneration` | Intern-S1 | T + I<sup>E+</sup> + V<sup>E+</sup> | `internlm/Intern-S1`, etc. | | ✅︎ | ✅︎ |
|
||||
| `InternVLChatModel` | InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I<sup>E+</sup> + (V<sup>E+</sup>) | `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-8B-Preview` | | | ✅︎ |
|
||||
| `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I<sup>+</sup> | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | | ✅︎ |
|
||||
@ -612,6 +614,7 @@ Specified using `--task generate`.
|
||||
| `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + I<sup>E</sup> | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. | | ✅︎ | ⚠️ |
|
||||
| `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + I<sup>E+</sup> | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | | ✅︎ | ✅︎ |
|
||||
| `Phi4MMForCausalLM` | Phi-4-multimodal | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Phi4MultimodalForCausalLM` | Phi-4-multimodal (HF Transformers) | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct` (with revision `refs/pr/70`), etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `PixtralForConditionalGeneration` | Mistral 3 (Mistral format), Pixtral (Mistral format) | T + I<sup>+</sup> | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistralai/Pixtral-12B-2409`, etc. | | ✅︎ | ✅︎ |
|
||||
| `QwenVLForConditionalGeneration`<sup>^</sup> | Qwen-VL | T + I<sup>E+</sup> | `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Qwen2AudioForConditionalGeneration` | Qwen2-Audio | T + A<sup>+</sup> | `Qwen/Qwen2-Audio-7B-Instruct` | | ✅︎ | ✅︎ |
|
||||
|
||||
@ -2,10 +2,14 @@
|
||||
|
||||
Reinforcement Learning from Human Feedback (RLHF) is a technique that fine-tunes language models using human-generated preference data to align model outputs with desired behaviors.
|
||||
|
||||
vLLM can be used to generate the completions for RLHF. The best way to do this is with libraries like [TRL](https://github.com/huggingface/trl), [OpenRLHF](https://github.com/OpenRLHF/OpenRLHF) and [verl](https://github.com/volcengine/verl).
|
||||
vLLM can be used to generate the completions for RLHF. Some ways to do this include using libraries like [TRL](https://github.com/huggingface/trl), [OpenRLHF](https://github.com/OpenRLHF/OpenRLHF), [verl](https://github.com/volcengine/verl) and [unsloth](https://github.com/unslothai/unsloth).
|
||||
|
||||
See the following basic examples to get started if you don't want to use an existing library:
|
||||
|
||||
- [Training and inference processes are located on separate GPUs (inspired by OpenRLHF)](../examples/offline_inference/rlhf.md)
|
||||
- [Training and inference processes are colocated on the same GPUs using Ray](../examples/offline_inference/rlhf_colocate.md)
|
||||
- [Utilities for performing RLHF with vLLM](../examples/offline_inference/rlhf_utils.md)
|
||||
|
||||
See the following notebooks showing how to use vLLM for GRPO:
|
||||
|
||||
- [Qwen-3 4B GRPO using Unsloth + vLLM](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_(4B)-GRPO.ipynb)
|
||||
|
||||
@ -190,6 +190,37 @@ def run_phi4mm(question: str, audio_count: int) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
def run_phi4_multimodal(question: str, audio_count: int) -> ModelRequestData:
|
||||
"""
|
||||
Phi-4-multimodal-instruct supports both image and audio inputs. Here, we
|
||||
show how to process audio inputs.
|
||||
"""
|
||||
model_path = snapshot_download(
|
||||
"microsoft/Phi-4-multimodal-instruct", revision="refs/pr/70"
|
||||
)
|
||||
# Since the vision-lora and speech-lora co-exist with the base model,
|
||||
# we have to manually specify the path of the lora weights.
|
||||
speech_lora_path = os.path.join(model_path, "speech-lora")
|
||||
placeholders = "<|audio|>" * audio_count
|
||||
|
||||
prompts = f"<|user|>{placeholders}{question}<|end|><|assistant|>"
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model=model_path,
|
||||
max_model_len=12800,
|
||||
max_num_seqs=2,
|
||||
enable_lora=True,
|
||||
max_lora_rank=320,
|
||||
limit_mm_per_prompt={"audio": audio_count},
|
||||
)
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompt=prompts,
|
||||
lora_requests=[LoRARequest("speech", 1, speech_lora_path)],
|
||||
)
|
||||
|
||||
|
||||
# Qwen2-Audio
|
||||
def run_qwen2_audio(question: str, audio_count: int) -> ModelRequestData:
|
||||
model_name = "Qwen/Qwen2-Audio-7B-Instruct"
|
||||
@ -303,6 +334,7 @@ model_example_map = {
|
||||
"granite_speech": run_granite_speech,
|
||||
"minicpmo": run_minicpmo,
|
||||
"phi4_mm": run_phi4mm,
|
||||
"phi4_multimodal": run_phi4_multimodal,
|
||||
"qwen2_audio": run_qwen2_audio,
|
||||
"qwen2_5_omni": run_qwen2_5_omni,
|
||||
"ultravox": run_ultravox,
|
||||
|
||||
@ -3,12 +3,12 @@
|
||||
import argparse
|
||||
import datetime
|
||||
import os
|
||||
import re
|
||||
from typing import Union
|
||||
|
||||
import albumentations
|
||||
import numpy as np
|
||||
import rasterio
|
||||
import regex as re
|
||||
import torch
|
||||
from einops import rearrange
|
||||
from terratorch.datamodules import Sen1Floods11NonGeoDataModule
|
||||
|
||||
@ -29,6 +29,7 @@ import shutil
|
||||
from pathlib import Path
|
||||
|
||||
from vllm import LLM, EngineArgs
|
||||
from vllm.model_executor.model_loader import ShardedStateLoader
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
|
||||
@ -39,7 +40,10 @@ def parse_args():
|
||||
"--output", "-o", required=True, type=str, help="path to output checkpoint"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--file-pattern", type=str, help="string pattern of saved filenames"
|
||||
"--file-pattern",
|
||||
type=str,
|
||||
default=ShardedStateLoader.DEFAULT_PATTERN,
|
||||
help="string pattern of saved filenames",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max-file-size",
|
||||
|
||||
@ -316,6 +316,85 @@ def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
# naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B
|
||||
def run_hyperclovax_seed_vision(
|
||||
questions: list[str], modality: str
|
||||
) -> ModelRequestData:
|
||||
model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
trust_remote_code=True,
|
||||
max_model_len=8192 if modality == "image" else 16384,
|
||||
limit_mm_per_prompt={modality: 1},
|
||||
)
|
||||
|
||||
messages = list()
|
||||
for question in questions:
|
||||
if modality == "image":
|
||||
"""
|
||||
ocr: List the words in the image in raster order.
|
||||
Even if the word order feels unnatural for reading,
|
||||
the model will handle it as long as it follows raster order.
|
||||
e.g. "Naver, CLOVA, bigshane"
|
||||
lens_keywords: List the entity names in the image.
|
||||
e.g. "iPhone"
|
||||
lens_local_keywords: List the entity names with quads in the image.
|
||||
e.g. "[0.07, 0.21, 0.92, 0.90] iPhone"
|
||||
"""
|
||||
messages.append(
|
||||
[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "image",
|
||||
"ocr": "",
|
||||
"lens_keywords": "",
|
||||
"lens_local_keywords": "",
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": question,
|
||||
},
|
||||
],
|
||||
}
|
||||
]
|
||||
)
|
||||
elif modality == "video":
|
||||
messages.append(
|
||||
[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "video",
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": question,
|
||||
},
|
||||
],
|
||||
}
|
||||
]
|
||||
)
|
||||
else:
|
||||
raise ValueError(f"Unsupported modality: {modality}")
|
||||
|
||||
prompts = tokenizer.apply_chat_template(
|
||||
messages,
|
||||
tokenize=False,
|
||||
add_generation_prompt=True,
|
||||
)
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
stop_token_ids=None,
|
||||
)
|
||||
|
||||
|
||||
# Idefics3-8B-Llama3
|
||||
def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
@ -389,6 +468,39 @@ def run_tarsier(questions: list[str], modality: str) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
# Intern-S1
|
||||
def run_interns1(questions: list[str], modality: str) -> ModelRequestData:
|
||||
model_name = "internlm/Intern-S1"
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
trust_remote_code=True,
|
||||
max_model_len=8192,
|
||||
max_num_seqs=2,
|
||||
limit_mm_per_prompt={modality: 1},
|
||||
enforce_eager=True,
|
||||
)
|
||||
|
||||
if modality == "image":
|
||||
placeholder = "<IMG_CONTEXT>"
|
||||
elif modality == "video":
|
||||
placeholder = "<video>"
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
|
||||
messages = [
|
||||
[{"role": "user", "content": f"{placeholder}\n{question}"}]
|
||||
for question in questions
|
||||
]
|
||||
prompts = tokenizer.apply_chat_template(
|
||||
messages, tokenize=False, add_generation_prompt=True
|
||||
)
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
# InternVL
|
||||
def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
|
||||
model_name = "OpenGVLab/InternVL3-2B"
|
||||
@ -987,6 +1099,41 @@ def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
# HF format Phi-4-multimodal-instruct
|
||||
def run_phi4_multimodal(questions: list[str], modality: str) -> ModelRequestData:
|
||||
"""
|
||||
Phi-4-multimodal-instruct supports both image and audio inputs. Here, we
|
||||
show how to process image inputs.
|
||||
"""
|
||||
assert modality == "image"
|
||||
model_path = snapshot_download(
|
||||
"microsoft/Phi-4-multimodal-instruct", revision="refs/pr/70"
|
||||
)
|
||||
# Since the vision-lora and speech-lora co-exist with the base model,
|
||||
# we have to manually specify the path of the lora weights.
|
||||
vision_lora_path = os.path.join(model_path, "vision-lora")
|
||||
prompts = [
|
||||
f"<|user|><|image|>{question}<|end|><|assistant|>" for question in questions
|
||||
]
|
||||
engine_args = EngineArgs(
|
||||
model=model_path,
|
||||
max_model_len=5120,
|
||||
max_num_seqs=2,
|
||||
max_num_batched_tokens=12800,
|
||||
enable_lora=True,
|
||||
max_lora_rank=320,
|
||||
# Note - mm_processor_kwargs can also be passed to generate/chat calls
|
||||
mm_processor_kwargs={"dynamic_hd": 16},
|
||||
limit_mm_per_prompt={"image": 1},
|
||||
)
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
lora_requests=[LoRARequest("vision", 1, vision_lora_path)],
|
||||
)
|
||||
|
||||
|
||||
# Pixtral HF-format
|
||||
def run_pixtral_hf(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
@ -1222,7 +1369,9 @@ model_example_map = {
|
||||
"glm4v": run_glm4v,
|
||||
"glm4_1v": run_glm4_1v,
|
||||
"h2ovl_chat": run_h2ovl,
|
||||
"hyperclovax_seed_vision": run_hyperclovax_seed_vision,
|
||||
"idefics3": run_idefics3,
|
||||
"interns1": run_interns1,
|
||||
"internvl_chat": run_internvl,
|
||||
"nemotron_vl": run_nemotron_vl,
|
||||
"keye_vl": run_keye_vl,
|
||||
@ -1244,6 +1393,7 @@ model_example_map = {
|
||||
"paligemma2": run_paligemma2,
|
||||
"phi3_v": run_phi3v,
|
||||
"phi4_mm": run_phi4mm,
|
||||
"phi4_multimodal": run_phi4_multimodal,
|
||||
"pixtral_hf": run_pixtral_hf,
|
||||
"qwen_vl": run_qwen_vl,
|
||||
"qwen2_vl": run_qwen2_vl,
|
||||
|
||||
@ -253,6 +253,33 @@ def load_smolvlm(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
def load_interns1(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
model_name = "internlm/Intern-S1"
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
trust_remote_code=True,
|
||||
max_model_len=4096,
|
||||
limit_mm_per_prompt={"image": len(image_urls)},
|
||||
)
|
||||
|
||||
placeholders = "\n".join(
|
||||
f"Image-{i}: <IMG_CONTEXT>\n" for i, _ in enumerate(image_urls, start=1)
|
||||
)
|
||||
messages = [{"role": "user", "content": f"{placeholders}\n{question}"}]
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
|
||||
prompt = tokenizer.apply_chat_template(
|
||||
messages, tokenize=False, add_generation_prompt=True
|
||||
)
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompt=prompt,
|
||||
image_data=[fetch_image(url) for url in image_urls],
|
||||
)
|
||||
|
||||
|
||||
def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
model_name = "OpenGVLab/InternVL2-2B"
|
||||
|
||||
@ -289,6 +316,53 @@ def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
def load_hyperclovax_seed_vision(
|
||||
question: str, image_urls: list[str]
|
||||
) -> ModelRequestData:
|
||||
model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
trust_remote_code=True,
|
||||
max_model_len=16384,
|
||||
limit_mm_per_prompt={"image": len(image_urls)},
|
||||
)
|
||||
|
||||
message = {"role": "user", "content": list()}
|
||||
for _image_url in image_urls:
|
||||
message["content"].append(
|
||||
{
|
||||
"type": "image",
|
||||
"image": _image_url,
|
||||
"ocr": "",
|
||||
"lens_keywords": "",
|
||||
"lens_local_keywords": "",
|
||||
}
|
||||
)
|
||||
message["content"].append(
|
||||
{
|
||||
"type": "text",
|
||||
"text": question,
|
||||
}
|
||||
)
|
||||
|
||||
prompt = tokenizer.apply_chat_template(
|
||||
[
|
||||
message,
|
||||
],
|
||||
tokenize=False,
|
||||
add_generation_prompt=True,
|
||||
)
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompt=prompt,
|
||||
stop_token_ids=None,
|
||||
image_data=[fetch_image(url) for url in image_urls],
|
||||
)
|
||||
|
||||
|
||||
def load_llava(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
# NOTE: CAUTION! Original Llava models wasn't really trained on multi-image inputs,
|
||||
# it will generate poor response for multi-image inputs!
|
||||
@ -686,6 +760,40 @@ def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
def load_phi4_multimodal(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
"""
|
||||
Phi-4-multimodal-instruct supports both image and audio inputs. Here, we
|
||||
show how to process multi images inputs.
|
||||
"""
|
||||
|
||||
model_path = snapshot_download(
|
||||
"microsoft/Phi-4-multimodal-instruct", revision="refs/pr/70"
|
||||
)
|
||||
# Since the vision-lora and speech-lora co-exist with the base model,
|
||||
# we have to manually specify the path of the lora weights.
|
||||
vision_lora_path = os.path.join(model_path, "vision-lora")
|
||||
engine_args = EngineArgs(
|
||||
model=model_path,
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
limit_mm_per_prompt={"image": len(image_urls)},
|
||||
enable_lora=True,
|
||||
max_lora_rank=320,
|
||||
# Note - mm_processor_kwargs can also be passed to generate/chat calls
|
||||
mm_processor_kwargs={"dynamic_hd": 4},
|
||||
)
|
||||
|
||||
placeholders = "<|image|>" * len(image_urls)
|
||||
prompt = f"<|user|>{placeholders}{question}<|end|><|assistant|>"
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompt=prompt,
|
||||
image_data=[fetch_image(url) for url in image_urls],
|
||||
lora_requests=[LoRARequest("vision", 1, vision_lora_path)],
|
||||
)
|
||||
|
||||
|
||||
def load_qwen_vl_chat(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
model_name = "Qwen/Qwen-VL-Chat"
|
||||
engine_args = EngineArgs(
|
||||
@ -899,7 +1007,9 @@ model_example_map = {
|
||||
"gemma3": load_gemma3,
|
||||
"h2ovl_chat": load_h2ovl,
|
||||
"idefics3": load_idefics3,
|
||||
"interns1": load_interns1,
|
||||
"internvl_chat": load_internvl,
|
||||
"hyperclovax_seed_vision": load_hyperclovax_seed_vision,
|
||||
"keye_vl": load_keye_vl,
|
||||
"kimi_vl": load_kimi_vl,
|
||||
"llava": load_llava,
|
||||
@ -912,6 +1022,7 @@ model_example_map = {
|
||||
"ovis": load_ovis,
|
||||
"phi3_v": load_phi3v,
|
||||
"phi4_mm": load_phi4mm,
|
||||
"phi4_multimodal": load_phi4_multimodal,
|
||||
"pixtral_hf": load_pixtral_hf,
|
||||
"qwen_vl_chat": load_qwen_vl_chat,
|
||||
"qwen2_vl": load_qwen2_vl,
|
||||
|
||||
@ -29,7 +29,7 @@ PROXY_PORT=${PROXY_PORT:-30001}
|
||||
PREFILL_GPUS=${PREFILL_GPUS:-0}
|
||||
DECODE_GPUS=${DECODE_GPUS:-1,2,3}
|
||||
PREFILL_PORTS=${PREFILL_PORTS:-20003}
|
||||
DECODE_PORTS=${DECODE_PORTS:-20005,20007,20009}
|
||||
DECODE_PORTS=${DECODE_PORTS:-20005,20007,20009}
|
||||
|
||||
echo "Warning: P2P NCCL disaggregated prefill XpYd support for vLLM v1 is experimental and subject to change."
|
||||
echo ""
|
||||
@ -164,7 +164,7 @@ main() {
|
||||
local gpu_id=${PREFILL_GPU_ARRAY[$i]}
|
||||
local port=${PREFILL_PORT_ARRAY[$i]}
|
||||
local kv_port=$((21001 + i))
|
||||
|
||||
|
||||
echo " Prefill server $((i+1)): GPU $gpu_id, Port $port, KV Port $kv_port"
|
||||
CUDA_VISIBLE_DEVICES=$gpu_id VLLM_USE_V1=1 vllm serve $MODEL \
|
||||
--enforce-eager \
|
||||
@ -193,7 +193,7 @@ main() {
|
||||
local gpu_id=${DECODE_GPU_ARRAY[$i]}
|
||||
local port=${DECODE_PORT_ARRAY[$i]}
|
||||
local kv_port=$((22001 + i))
|
||||
|
||||
|
||||
echo " Decode server $((i+1)): GPU $gpu_id, Port $port, KV Port $kv_port"
|
||||
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=$gpu_id vllm serve $MODEL \
|
||||
--enforce-eager \
|
||||
@ -233,7 +233,7 @@ main() {
|
||||
# Run Benchmark
|
||||
# =============================================================================
|
||||
cd ../../../benchmarks/
|
||||
python3 benchmark_serving.py --port 10001 --seed $(date +%s) \
|
||||
vllm bench serve --port 10001 --seed $(date +%s) \
|
||||
--model $MODEL \
|
||||
--dataset-name random --random-input-len 7500 --random-output-len 200 \
|
||||
--num-prompts 200 --burstiness 100 --request-rate 2 | tee benchmark.log
|
||||
@ -243,4 +243,4 @@ main() {
|
||||
cleanup
|
||||
}
|
||||
|
||||
main
|
||||
main
|
||||
|
||||
@ -28,7 +28,7 @@ Submit some sample requests to the server:
|
||||
```bash
|
||||
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
||||
|
||||
python3 ../../../benchmarks/benchmark_serving.py \
|
||||
vllm bench serve \
|
||||
--model mistralai/Mistral-7B-v0.1 \
|
||||
--tokenizer mistralai/Mistral-7B-v0.1 \
|
||||
--endpoint /v1/completions \
|
||||
|
||||
@ -122,7 +122,7 @@ main() {
|
||||
|
||||
# begin benchmark
|
||||
cd ../../../../benchmarks/
|
||||
python3 benchmark_serving.py --port 9000 --seed $(date +%s) \
|
||||
vllm bench serve --port 9000 --seed $(date +%s) \
|
||||
--model meta-llama/Llama-3.1-8B-Instruct \
|
||||
--dataset-name random --random-input-len 7500 --random-output-len 200 \
|
||||
--num-prompts 200 --burstiness 100 --request-rate 3.6 | tee benchmark.log
|
||||
@ -133,4 +133,4 @@ main() {
|
||||
|
||||
}
|
||||
|
||||
main
|
||||
main
|
||||
|
||||
@ -10,7 +10,8 @@ setuptools>=77.0.3,<80.0.0
|
||||
--extra-index-url https://download.pytorch.org/whl/cpu
|
||||
torch==2.6.0+cpu; platform_machine == "x86_64" # torch>2.6.0+cpu has performance regression on x86 platform, see https://github.com/pytorch/pytorch/pull/151218
|
||||
torch==2.7.0; platform_system == "Darwin"
|
||||
torch==2.7.0; platform_machine == "ppc64le" or platform_machine == "aarch64"
|
||||
torch==2.7.0; platform_machine == "ppc64le"
|
||||
torch==2.6.0; platform_machine == "aarch64" # for arm64 CPUs, torch 2.7.0 has a issue: https://github.com/vllm-project/vllm/issues/17960
|
||||
|
||||
# required for the image processor of minicpm-o-2_6, this must be updated alongside torch
|
||||
torchaudio; platform_machine != "ppc64le" and platform_machine != "s390x"
|
||||
@ -25,3 +26,6 @@ datasets # for benchmark scripts
|
||||
intel-openmp==2024.2.1; platform_machine == "x86_64"
|
||||
intel_extension_for_pytorch==2.6.0; platform_machine == "x86_64" # torch>2.6.0+cpu has performance regression on x86 platform, see https://github.com/pytorch/pytorch/pull/151218
|
||||
triton==3.2.0; platform_machine == "x86_64" # Triton is required for torch 2.6+cpu, as it is imported in torch.compile.
|
||||
|
||||
# Use this to gather CPU info and optimize based on ARM Neoverse cores
|
||||
py-cpuinfo; platform_machine == "aarch64"
|
||||
|
||||
@ -19,8 +19,8 @@ nixl==0.3.0
|
||||
--find-links https://storage.googleapis.com/libtpu-releases/index.html
|
||||
--find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html
|
||||
--find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
|
||||
torch==2.9.0.dev20250716
|
||||
torchvision==0.24.0.dev20250716
|
||||
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.9.0.dev20250716-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
|
||||
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.9.0.dev20250716-cp312-cp312-linux_x86_64.whl ; python_version == "3.12"
|
||||
torch==2.9.0.dev20250724
|
||||
torchvision==0.24.0.dev20250724
|
||||
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.9.0.dev20250724-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
|
||||
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.9.0.dev20250724-cp312-cp312-linux_x86_64.whl ; python_version == "3.12"
|
||||
|
||||
|
||||
@ -1062,8 +1062,17 @@ class VllmRunner:
|
||||
return [req_output.outputs.score for req_output in req_outputs]
|
||||
|
||||
def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]:
|
||||
executor = self.llm.llm_engine.model_executor
|
||||
return executor.apply_model(func)
|
||||
if hasattr(self.llm.llm_engine, "model_executor"):
|
||||
# This works either in V0 or in V1 with
|
||||
# VLLM_ENABLE_V1_MULTIPROCESSING=0
|
||||
executor = self.llm.llm_engine.model_executor
|
||||
return executor.apply_model(func)
|
||||
|
||||
# This works in V1 with VLLM_ALLOW_INSECURE_SERIALIZATION=1
|
||||
def _apply_model(self):
|
||||
return func(self.get_model())
|
||||
|
||||
return self.llm.llm_engine.collective_rpc(_apply_model)
|
||||
|
||||
def __enter__(self):
|
||||
return self
|
||||
|
||||
93
tests/entrypoints/openai/test_skip_tokenizer.py
Normal file
93
tests/entrypoints/openai/test_skip_tokenizer.py
Normal file
@ -0,0 +1,93 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import base64
|
||||
import io
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
import requests
|
||||
import torch
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
MODEL_NAME = "christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM"
|
||||
DTYPE = "float16"
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def v1(run_with_both_engines):
|
||||
# Simple autouse wrapper to run both engines for each test
|
||||
# This can be promoted up to conftest.py to run for every
|
||||
# test in a package
|
||||
pass
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def server():
|
||||
args = [
|
||||
"--task",
|
||||
"embed",
|
||||
# use half precision for speed and memory savings in CI environment
|
||||
"--dtype",
|
||||
DTYPE,
|
||||
"--enforce-eager",
|
||||
"--trust-remote-code",
|
||||
"--skip-tokenizer-init",
|
||||
"--max-num-seqs",
|
||||
"32"
|
||||
]
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_single_request(server: RemoteOpenAIServer, model_name: str):
|
||||
|
||||
pixel_values = torch.full((6, 512, 512), 1.0, dtype=torch.float16)
|
||||
location_coords = torch.full((1, 2), 1.0, dtype=torch.float16)
|
||||
|
||||
buffer_tiff = io.BytesIO()
|
||||
torch.save(pixel_values, buffer_tiff)
|
||||
buffer_tiff.seek(0)
|
||||
binary_data = buffer_tiff.read()
|
||||
base64_tensor_embedding = base64.b64encode(binary_data).decode('utf-8')
|
||||
|
||||
buffer_coord = io.BytesIO()
|
||||
torch.save(location_coords, buffer_coord)
|
||||
buffer_coord.seek(0)
|
||||
binary_data = buffer_coord.read()
|
||||
base64_coord_embedding = base64.b64encode(binary_data).decode('utf-8')
|
||||
|
||||
prompt = {
|
||||
"model":
|
||||
model_name,
|
||||
"additional_data": {
|
||||
"prompt_token_ids": [1]
|
||||
},
|
||||
"encoding_format":
|
||||
"base64",
|
||||
"messages": [{
|
||||
"role":
|
||||
"user",
|
||||
"content": [{
|
||||
"type": "image_embeds",
|
||||
"image_embeds": {
|
||||
"pixel_values": base64_tensor_embedding,
|
||||
"location_coords": base64_coord_embedding,
|
||||
},
|
||||
}],
|
||||
}]
|
||||
}
|
||||
|
||||
# test single pooling
|
||||
response = requests.post(server.url_for("pooling"), json=prompt)
|
||||
response.raise_for_status()
|
||||
|
||||
output = response.json()["data"][0]['data']
|
||||
|
||||
np_response = np.frombuffer(base64.b64decode(output), dtype=np.float32)
|
||||
|
||||
assert len(np_response) == 524288
|
||||
191
tests/kernels/attention/test_aiter_flash_attn.py
Normal file
191
tests/kernels/attention/test_aiter_flash_attn.py
Normal file
@ -0,0 +1,191 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
import vllm.v1.attention.backends.rocm_aiter_fa # noqa: F401
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
NUM_HEADS = [(4, 4), (8, 2), (16, 2)]
|
||||
HEAD_SIZES = [128, 256]
|
||||
BLOCK_SIZES = [16, 32]
|
||||
DTYPES = [torch.float16, torch.bfloat16]
|
||||
QDTYPES = [None]
|
||||
# one value large enough to test overflow in index calculation.
|
||||
# one value small enough to test the schema op check
|
||||
NUM_BLOCKS = [32768, 2048]
|
||||
|
||||
|
||||
def ref_paged_attn(
|
||||
query: torch.Tensor,
|
||||
key_cache: torch.Tensor,
|
||||
value_cache: torch.Tensor,
|
||||
query_lens: list[int],
|
||||
kv_lens: list[int],
|
||||
block_tables: torch.Tensor,
|
||||
scale: float,
|
||||
sliding_window: Optional[int] = None,
|
||||
soft_cap: Optional[float] = None,
|
||||
) -> torch.Tensor:
|
||||
num_seqs = len(query_lens)
|
||||
block_tables = block_tables.cpu().numpy()
|
||||
_, block_size, num_kv_heads, head_size = key_cache.shape
|
||||
|
||||
outputs: list[torch.Tensor] = []
|
||||
start_idx = 0
|
||||
for i in range(num_seqs):
|
||||
query_len = query_lens[i]
|
||||
kv_len = kv_lens[i]
|
||||
q = query[start_idx:start_idx + query_len]
|
||||
q *= scale
|
||||
|
||||
num_kv_blocks = (kv_len + block_size - 1) // block_size
|
||||
block_indices = block_tables[i, :num_kv_blocks]
|
||||
|
||||
k = key_cache[block_indices].view(-1, num_kv_heads, head_size)
|
||||
k = k[:kv_len]
|
||||
v = value_cache[block_indices].view(-1, num_kv_heads, head_size)
|
||||
v = v[:kv_len]
|
||||
|
||||
if q.shape[1] != k.shape[1]:
|
||||
k = torch.repeat_interleave(k, q.shape[1] // k.shape[1], dim=1)
|
||||
v = torch.repeat_interleave(v, q.shape[1] // v.shape[1], dim=1)
|
||||
attn = torch.einsum("qhd,khd->hqk", q, k).float()
|
||||
empty_mask = torch.ones(query_len, kv_len)
|
||||
mask = torch.triu(empty_mask, diagonal=kv_len - query_len + 1).bool()
|
||||
if sliding_window is not None:
|
||||
sliding_window_mask = torch.triu(empty_mask,
|
||||
diagonal=kv_len -
|
||||
(query_len + sliding_window) +
|
||||
1).bool().logical_not()
|
||||
mask |= sliding_window_mask
|
||||
if soft_cap is not None:
|
||||
attn = soft_cap * torch.tanh(attn / soft_cap)
|
||||
attn.masked_fill_(mask, float("-inf"))
|
||||
attn = torch.softmax(attn, dim=-1).to(v.dtype)
|
||||
out = torch.einsum("hqk,khd->qhd", attn, v)
|
||||
|
||||
outputs.append(out)
|
||||
start_idx += query_len
|
||||
|
||||
return torch.cat(outputs, dim=0)
|
||||
|
||||
|
||||
@pytest.mark.skipif(not current_platform.is_rocm(),
|
||||
reason="Only ROCm is supported")
|
||||
@pytest.mark.parametrize("seq_lens",
|
||||
[[(10, 1328), (5, 18),
|
||||
(129, 463)], [(8, 523), (24, 37), (3, 2011)]])
|
||||
@pytest.mark.parametrize("num_heads", NUM_HEADS)
|
||||
@pytest.mark.parametrize("head_size", HEAD_SIZES)
|
||||
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
|
||||
@pytest.mark.parametrize("sliding_window", [None, 256])
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@pytest.mark.parametrize("soft_cap", [None])
|
||||
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
|
||||
@pytest.mark.parametrize("q_dtype", QDTYPES)
|
||||
@torch.inference_mode()
|
||||
def test_varlen_with_paged_kv(
|
||||
seq_lens: list[tuple[int, int]],
|
||||
num_heads: tuple[int, int],
|
||||
head_size: int,
|
||||
sliding_window: Optional[int],
|
||||
dtype: torch.dtype,
|
||||
block_size: int,
|
||||
soft_cap: Optional[float],
|
||||
num_blocks: int,
|
||||
q_dtype: Optional[torch.dtype],
|
||||
) -> None:
|
||||
torch.set_default_device("cuda")
|
||||
current_platform.seed_everything(0)
|
||||
num_seqs = len(seq_lens)
|
||||
query_lens = [x[0] for x in seq_lens]
|
||||
kv_lens = [x[1] for x in seq_lens]
|
||||
num_query_heads = num_heads[0]
|
||||
num_kv_heads = num_heads[1]
|
||||
assert num_query_heads % num_kv_heads == 0
|
||||
max_query_len = max(query_lens)
|
||||
max_kv_len = max(kv_lens)
|
||||
window_size = ((sliding_window - 1, 0) if sliding_window is not None else
|
||||
(-1, -1))
|
||||
scale = head_size**-0.5
|
||||
|
||||
query = torch.randn(sum(query_lens),
|
||||
num_query_heads,
|
||||
head_size,
|
||||
dtype=dtype)
|
||||
key_cache = torch.randn(num_blocks,
|
||||
block_size,
|
||||
num_kv_heads,
|
||||
head_size,
|
||||
dtype=dtype)
|
||||
value_cache = torch.randn_like(key_cache)
|
||||
cu_query_lens = torch.tensor([0] + query_lens,
|
||||
dtype=torch.int32).cumsum(dim=0,
|
||||
dtype=torch.int32)
|
||||
|
||||
cu_seq_lens = torch.tensor([0] + kv_lens,
|
||||
dtype=torch.int32).cumsum(dim=0,
|
||||
dtype=torch.int32)
|
||||
kv_lens = torch.tensor(kv_lens, dtype=torch.int32)
|
||||
|
||||
max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size
|
||||
block_tables = torch.randint(0,
|
||||
num_blocks,
|
||||
(num_seqs, max_num_blocks_per_seq),
|
||||
dtype=torch.int32)
|
||||
|
||||
output = torch.empty_like(query)
|
||||
|
||||
maybe_quantized_query = query
|
||||
maybe_quantized_key_cache = key_cache
|
||||
maybe_quantized_value_cache = value_cache
|
||||
k_descale = None
|
||||
v_descale = None
|
||||
if q_dtype is not None:
|
||||
# QKV are drawn from N(0, 1): no need for a fp8 scaling factor
|
||||
maybe_quantized_query = query.to(q_dtype)
|
||||
maybe_quantized_key_cache = key_cache.to(q_dtype)
|
||||
maybe_quantized_value_cache = value_cache.to(q_dtype)
|
||||
|
||||
scale_shape = (num_seqs, num_kv_heads)
|
||||
k_descale = torch.ones(scale_shape, dtype=torch.float32)
|
||||
v_descale = torch.ones(scale_shape, dtype=torch.float32)
|
||||
|
||||
torch.ops.vllm.flash_attn_varlen_func(
|
||||
maybe_quantized_query,
|
||||
maybe_quantized_key_cache,
|
||||
maybe_quantized_value_cache,
|
||||
out=output,
|
||||
cu_seqlens_q=cu_query_lens,
|
||||
max_seqlen_q=max_query_len,
|
||||
max_seqlen_k=max_kv_len,
|
||||
softmax_scale=scale,
|
||||
alibi_slopes=None,
|
||||
window_size=window_size,
|
||||
block_table=block_tables,
|
||||
cu_seqlens_k=cu_seq_lens,
|
||||
k_scale=k_descale,
|
||||
v_scale=v_descale,
|
||||
)
|
||||
|
||||
ref_output = ref_paged_attn(
|
||||
query=query,
|
||||
key_cache=key_cache,
|
||||
value_cache=value_cache,
|
||||
query_lens=query_lens,
|
||||
kv_lens=kv_lens,
|
||||
block_tables=block_tables,
|
||||
scale=scale,
|
||||
sliding_window=sliding_window,
|
||||
soft_cap=soft_cap,
|
||||
)
|
||||
|
||||
atol, rtol = 2e-2, 2e-2
|
||||
if q_dtype is not None:
|
||||
atol, rtol = 1.5e-1, 1.5e-1
|
||||
torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol), \
|
||||
f"{torch.max(torch.abs(output - ref_output))}"
|
||||
@ -17,28 +17,34 @@ from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import (
|
||||
moe_permute, moe_permute_unpermute_supported, moe_unpermute)
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
NUM_EXPERTS = [16, 64]
|
||||
NUM_EXPERTS = [16, 64, 256]
|
||||
TOP_KS = [2, 4, 6, 8]
|
||||
EP_SIZE = [1, 4, 16]
|
||||
current_platform.seed_everything(0)
|
||||
|
||||
|
||||
def torch_permute(hidden_states: torch.Tensor,
|
||||
topk_ids: torch.Tensor,
|
||||
token_expert_indices: torch.Tensor,
|
||||
topk: int,
|
||||
n_expert: int,
|
||||
n_local_expert: int,
|
||||
start_expert: int,
|
||||
expert_map: Optional[torch.Tensor] = None,
|
||||
align_block_size: Optional[int] = None,
|
||||
fill_invalid_expert: int = -1) -> list[torch.Tensor]:
|
||||
def torch_permute(
|
||||
hidden_states: torch.Tensor,
|
||||
topk_ids: torch.Tensor,
|
||||
# token_expert_indices: torch.Tensor,
|
||||
topk: int,
|
||||
n_expert: int,
|
||||
n_local_expert: int,
|
||||
start_expert: int,
|
||||
expert_map: Optional[torch.Tensor] = None,
|
||||
align_block_size: Optional[int] = None,
|
||||
fill_invalid_expert: int = -1) -> list[torch.Tensor]:
|
||||
n_token, n_hidden = hidden_states.shape[0], hidden_states.shape[1]
|
||||
if expert_map is not None:
|
||||
is_local_expert = (expert_map[topk_ids] != -1)
|
||||
not_local_expert = (expert_map[topk_ids] == -1)
|
||||
topk_ids = is_local_expert * (
|
||||
topk_ids - start_expert) + not_local_expert * (topk_ids + n_expert)
|
||||
token_expert_indices = torch.arange(0,
|
||||
n_token * topk,
|
||||
dtype=torch.int32,
|
||||
device=hidden_states.device).reshape(
|
||||
(n_token, topk))
|
||||
|
||||
sorted_topk_ids, sorted_indices = torch.sort(topk_ids.flatten(),
|
||||
stable=True)
|
||||
@ -59,8 +65,8 @@ def torch_permute(hidden_states: torch.Tensor,
|
||||
valid_row_idx = []
|
||||
if align_block_size is None:
|
||||
|
||||
permuted_hidden_states = hidden_states[dst_row_id2src_row_id_map %
|
||||
n_token, ...]
|
||||
permuted_hidden_states = hidden_states[dst_row_id2src_row_id_map //
|
||||
topk, ...]
|
||||
permuted_row_size = permuted_hidden_states.shape[0]
|
||||
m_indices = torch.empty(permuted_row_size,
|
||||
device="cuda",
|
||||
@ -73,14 +79,21 @@ def torch_permute(hidden_states: torch.Tensor,
|
||||
0, n_token * topk, device="cuda",
|
||||
dtype=torch.int32)[src2dst_idx].reshape((n_token, topk))
|
||||
valid_row_idx += [i for i in range(expert_first_token_offset[-1])]
|
||||
dst_row_id2src_row_id_map[
|
||||
expert_first_token_offset[-1]:] = n_token * topk
|
||||
return [
|
||||
permuted_hidden_states, expert_first_token_offset,
|
||||
src_row_id2dst_row_id_map, m_indices, valid_row_idx
|
||||
src_row_id2dst_row_id_map, dst_row_id2src_row_id_map, m_indices,
|
||||
valid_row_idx
|
||||
]
|
||||
else:
|
||||
permuted_row_size = (topk * n_token + n_expert *
|
||||
(align_block_size - 1) + align_block_size -
|
||||
1) // align_block_size * align_block_size
|
||||
permuted_idx = torch.full((permuted_row_size, ),
|
||||
n_token * topk,
|
||||
dtype=torch.int32,
|
||||
device=hidden_states.device)
|
||||
permuted_hidden_states = torch.empty((permuted_row_size, n_hidden),
|
||||
device="cuda",
|
||||
dtype=hidden_states.dtype)
|
||||
@ -105,13 +118,16 @@ def torch_permute(hidden_states: torch.Tensor,
|
||||
align_first_token_offset = align_expert_first_token_offset[i - 1]
|
||||
align_last_token_offset = align_expert_first_token_offset[i]
|
||||
dst_row_id2src_row_id_in_expert = dst_row_id2src_row_id_map[
|
||||
first_token_offset:first_token_offset +
|
||||
n_token_in_expert] % n_token
|
||||
first_token_offset:first_token_offset + n_token_in_expert]
|
||||
# store token in current expert with align_first_token_offset
|
||||
permuted_hidden_states[align_first_token_offset:\
|
||||
align_first_token_offset+n_token_in_expert,\
|
||||
...] = hidden_states[\
|
||||
dst_row_id2src_row_id_in_expert, ...]
|
||||
dst_row_id2src_row_id_in_expert // topk,\
|
||||
...]
|
||||
permuted_idx[align_first_token_offset:\
|
||||
align_first_token_offset+\
|
||||
n_token_in_expert] = dst_row_id2src_row_id_in_expert
|
||||
# set current expert m_indices
|
||||
m_indices[align_first_token_offset:align_last_token_offset] = i - 1
|
||||
valid_row_idx += [
|
||||
@ -135,7 +151,7 @@ def torch_permute(hidden_states: torch.Tensor,
|
||||
src2dst_idx].reshape((n_token, topk))
|
||||
return [
|
||||
permuted_hidden_states, align_expert_first_token_offset,
|
||||
align_src_row_id2dst_row_id, m_indices, valid_row_idx
|
||||
align_src_row_id2dst_row_id, permuted_idx, m_indices, valid_row_idx
|
||||
]
|
||||
|
||||
|
||||
@ -146,15 +162,18 @@ def torch_unpermute(permuted_hidden_states: torch.Tensor,
|
||||
valid_row_idx: torch.Tensor, topk: int,
|
||||
n_expert: int) -> torch.Tensor:
|
||||
# ignore invalid row
|
||||
n_hidden = permuted_hidden_states.shape[1]
|
||||
mask = torch.zeros(permuted_hidden_states.shape[0],
|
||||
dtype=bool,
|
||||
device="cuda")
|
||||
mask[valid_row_idx] = True
|
||||
permuted_hidden_states[~mask] = 0
|
||||
idx = src_row_id2dst_row_id_map.flatten()[
|
||||
token_expert_indices.flatten()].reshape(token_expert_indices.shape)
|
||||
output = permuted_hidden_states[idx, ...] * topk_weights[..., None]
|
||||
output = output.sum(dim=1).to(permuted_hidden_states.dtype)
|
||||
|
||||
permuted_hidden_states = permuted_hidden_states[
|
||||
src_row_id2dst_row_id_map.flatten(), ...]
|
||||
permuted_hidden_states = permuted_hidden_states.view(-1, topk, n_hidden)
|
||||
output = (permuted_hidden_states * topk_weights.unsqueeze(2)).sum(1).to(
|
||||
permuted_hidden_states.dtype)
|
||||
return output
|
||||
|
||||
|
||||
@ -184,43 +203,56 @@ def test_moe_permute_unpermute(n_token: int, n_hidden: int, topk: int,
|
||||
gating_output = torch.randn((n_token, n_expert), device="cuda").to(dtype)
|
||||
topk_weights, topk_ids, token_expert_indices = fused_topk(
|
||||
hidden_states, gating_output, topk, False)
|
||||
gold0, gold1, gold2, gold3, valid_row_idx = torch_permute(
|
||||
hidden_states,
|
||||
topk_ids,
|
||||
token_expert_indices,
|
||||
topk,
|
||||
n_expert,
|
||||
n_local_expert,
|
||||
start_expert,
|
||||
expert_map=expert_map,
|
||||
align_block_size=align_block_size,
|
||||
fill_invalid_expert=fill_invalid_expert)
|
||||
(gold_permuted_hidden_states, gold_expert_first_token_offset,
|
||||
gold_inv_permuted_idx, gold_permuted_idx, gold_m_indices,
|
||||
valid_row_idx) = torch_permute(
|
||||
hidden_states,
|
||||
topk_ids,
|
||||
# token_expert_indices,
|
||||
topk,
|
||||
n_expert,
|
||||
n_local_expert,
|
||||
start_expert,
|
||||
expert_map=expert_map,
|
||||
align_block_size=align_block_size,
|
||||
fill_invalid_expert=fill_invalid_expert)
|
||||
|
||||
result0, result1, result2, result3 = moe_permute(
|
||||
hidden_states, topk_weights, topk_ids, token_expert_indices, topk,
|
||||
n_expert, n_local_expert, expert_map, align_block_size,
|
||||
fill_invalid_expert)
|
||||
(permuted_hidden_states, _, expert_first_token_offset, inv_permuted_idx,
|
||||
m_indices) = moe_permute(hidden_states=hidden_states,
|
||||
a1q_scale=None,
|
||||
topk_ids=topk_ids,
|
||||
n_expert=n_expert,
|
||||
n_local_expert=n_local_expert,
|
||||
expert_map=expert_map,
|
||||
align_block_size=align_block_size,
|
||||
fill_invalid_expert=fill_invalid_expert)
|
||||
|
||||
# check expert_first_token_offset
|
||||
torch.testing.assert_close(gold1, result1, atol=0, rtol=0)
|
||||
# check src_row_id2dst_row_id_map
|
||||
torch.testing.assert_close(gold2, result2, atol=0, rtol=0)
|
||||
# check mindice
|
||||
torch.testing.assert_close(gold3, result3, atol=0, rtol=0)
|
||||
# check permuted_hidden_states, only valid token
|
||||
torch.testing.assert_close(gold0[valid_row_idx],
|
||||
result0[valid_row_idx],
|
||||
torch.testing.assert_close(gold_expert_first_token_offset,
|
||||
expert_first_token_offset,
|
||||
atol=0,
|
||||
rtol=0)
|
||||
# check src_row_id2dst_row_id_map
|
||||
torch.testing.assert_close(gold_inv_permuted_idx.flatten(),
|
||||
inv_permuted_idx,
|
||||
atol=0,
|
||||
rtol=0)
|
||||
# check mindice
|
||||
torch.testing.assert_close(gold_m_indices, m_indices, atol=0, rtol=0)
|
||||
# check permuted_hidden_states, only valid token
|
||||
torch.testing.assert_close(gold_permuted_hidden_states[valid_row_idx],
|
||||
permuted_hidden_states[valid_row_idx],
|
||||
atol=0,
|
||||
rtol=0)
|
||||
|
||||
# add a random tensor to simulate group gemm
|
||||
result0 = 0.5 * result0 + torch.randn_like(result0)
|
||||
result0 = 0.5 * permuted_hidden_states + torch.randn_like(
|
||||
permuted_hidden_states)
|
||||
result4 = torch.empty_like(hidden_states)
|
||||
moe_unpermute(result4, result0, topk_weights, inv_permuted_idx,
|
||||
expert_first_token_offset)
|
||||
|
||||
result4 = moe_unpermute(result0, topk_weights, topk_ids, result2, result1,
|
||||
topk, n_expert, n_local_expert)
|
||||
gold4 = torch_unpermute(result0, topk_weights, topk_ids,
|
||||
token_expert_indices, result2, valid_row_idx, topk,
|
||||
n_local_expert)
|
||||
|
||||
token_expert_indices, inv_permuted_idx,
|
||||
valid_row_idx, topk, n_local_expert)
|
||||
# check unpermuted hidden
|
||||
torch.testing.assert_close(result4, gold4, atol=2e-2, rtol=0)
|
||||
|
||||
@ -22,10 +22,12 @@ REVISION_ROBERTA = os.environ.get("REVISION", "main")
|
||||
|
||||
@pytest.mark.skipif(current_platform.is_rocm(),
|
||||
reason="Xformers backend is not supported on ROCm.")
|
||||
def test_model_loading_with_params(vllm_runner):
|
||||
def test_model_loading_with_params(vllm_runner, monkeypatch):
|
||||
"""
|
||||
Test parameter weight loading with tp>1.
|
||||
"""
|
||||
# to use apply_model
|
||||
monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
|
||||
with vllm_runner(model_name=MODEL_NAME,
|
||||
revision=REVISION,
|
||||
dtype="float16",
|
||||
@ -61,10 +63,12 @@ def test_model_loading_with_params(vllm_runner):
|
||||
|
||||
@pytest.mark.skipif(current_platform.is_rocm(),
|
||||
reason="Xformers backend is not supported on ROCm.")
|
||||
def test_roberta_model_loading_with_params(vllm_runner):
|
||||
def test_roberta_model_loading_with_params(vllm_runner, monkeypatch):
|
||||
"""
|
||||
Test parameter weight loading with tp>1.
|
||||
"""
|
||||
# to use apply_model
|
||||
monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
|
||||
with vllm_runner(model_name=MODEL_NAME_ROBERTA,
|
||||
revision=REVISION_ROBERTA,
|
||||
dtype="float16",
|
||||
@ -101,10 +105,12 @@ def test_roberta_model_loading_with_params(vllm_runner):
|
||||
|
||||
@pytest.mark.skipif(current_platform.is_rocm(),
|
||||
reason="Xformers backend is not supported on ROCm.")
|
||||
def test_facebook_roberta_model_loading_with_params(vllm_runner):
|
||||
def test_facebook_roberta_model_loading_with_params(vllm_runner, monkeypatch):
|
||||
"""
|
||||
Test loading roberta-base model with no lm_head.
|
||||
"""
|
||||
# to use apply_model
|
||||
monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
|
||||
model_name = "FacebookAI/roberta-base"
|
||||
with vllm_runner(model_name=model_name,
|
||||
dtype="float16",
|
||||
|
||||
@ -39,17 +39,9 @@ def v1(run_with_both_engines):
|
||||
pytest.param("ssmits/Qwen2-7B-Instruct-embed-base",
|
||||
marks=[pytest.mark.skip_v0, pytest.mark.cpu_model]),
|
||||
# [Encoder-only]
|
||||
pytest.param(
|
||||
"BAAI/bge-base-en-v1.5",
|
||||
marks=[
|
||||
# CPU only supports V1
|
||||
pytest.mark.core_model,
|
||||
pytest.mark.skip_v1
|
||||
]),
|
||||
pytest.param("sentence-transformers/all-MiniLM-L12-v2",
|
||||
marks=[pytest.mark.skip_v1]),
|
||||
pytest.param("intfloat/multilingual-e5-small",
|
||||
marks=[pytest.mark.skip_v1]),
|
||||
pytest.param("BAAI/bge-base-en-v1.5", marks=[pytest.mark.core_model]),
|
||||
pytest.param("sentence-transformers/all-MiniLM-L12-v2"),
|
||||
pytest.param("intfloat/multilingual-e5-small"),
|
||||
pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct",
|
||||
marks=[pytest.mark.skip_v1]),
|
||||
# [Cross-Encoder]
|
||||
|
||||
@ -23,6 +23,14 @@ RERANK_MODELS = [
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def v1(run_with_both_engines):
|
||||
# Simple autouse wrapper to run both engines for each test
|
||||
# This can be promoted up to conftest.py to run for every
|
||||
# test in a package
|
||||
pass
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", EMBEDDING_MODELS)
|
||||
def test_embed_models_mteb(hf_runner, vllm_runner,
|
||||
model_info: EmbedModelInfo) -> None:
|
||||
|
||||
@ -677,6 +677,7 @@ VLM_TEST_SETTINGS = {
|
||||
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
||||
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
|
||||
video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
|
||||
multi_image_prompt="Picture 1: <vlm_image>\nPicture 2: <vlm_image>\nDescribe these two images with one paragraph respectively.", # noqa: E501
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
auto_cls=AutoModelForVision2Seq,
|
||||
|
||||
@ -22,6 +22,9 @@ from transformers import (AutoConfig, AutoProcessor, AutoTokenizer,
|
||||
GenerationConfig)
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.v1.executor.abstract import Executor
|
||||
from vllm.v1.kv_cache_interface import (ChunkedLocalAttentionSpec,
|
||||
FullAttentionSpec)
|
||||
|
||||
from ....utils import multi_gpu_test
|
||||
|
||||
@ -69,6 +72,26 @@ def run_maverick_serving(model: str):
|
||||
raise
|
||||
|
||||
|
||||
def get_rope_layers_config(model_path: str) -> list[int]:
|
||||
"""
|
||||
Get the interleaved RoPE configuration from HuggingFace config
|
||||
|
||||
Args:
|
||||
model_path: Path to the local directory containing the reduced
|
||||
Maverick model checkpoint
|
||||
|
||||
Returns:
|
||||
List of 0 or 1 indicating whether each layer uses RoPE and local attn
|
||||
0 indicates that RoPE is not used while 1 indicates that RoPE is used.
|
||||
"""
|
||||
config_path = Path(model_path) / "config.json"
|
||||
model_config = json.loads(config_path.read_text())
|
||||
text_config = model_config["text_config"]
|
||||
no_rope_layers = text_config["no_rope_layers"]
|
||||
print(f"Found no_rope_layers: {no_rope_layers}")
|
||||
return no_rope_layers
|
||||
|
||||
|
||||
def create_reduced_maverick_model(
|
||||
original_model_name:
|
||||
str = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
||||
@ -113,7 +136,6 @@ def create_reduced_maverick_model(
|
||||
print("Loading original model configuration...")
|
||||
original_config = AutoConfig.from_pretrained(original_model_name,
|
||||
trust_remote_code=True)
|
||||
|
||||
print("Creating reduced configuration...")
|
||||
reduced_config = create_reduced_config(original_config, text_layers,
|
||||
num_experts, vision_layers)
|
||||
@ -510,21 +532,32 @@ def save_weights_to_safetensors(weights: dict[str, torch.Tensor],
|
||||
f"{index_data['metadata']['total_size'] / (1024**3):.2f} GB")
|
||||
|
||||
|
||||
def run_reduced_model(model_path: str,
|
||||
should_profile: bool = False,
|
||||
**kwargs) -> None:
|
||||
"""Test the created reduced model with vLLM."""
|
||||
|
||||
print(f"\nTesting reduced model at {model_path}...")
|
||||
|
||||
llm = LLM(
|
||||
model=model_path,
|
||||
trust_remote_code=True,
|
||||
max_model_len=512, # Small context for testing
|
||||
gpu_memory_utilization=0.3, # Conservative memory usage
|
||||
**kwargs,
|
||||
def check_attention_spec_interleaved_rope(
|
||||
llm: LLM,
|
||||
num_attention_layers: int,
|
||||
num_ranks: int,
|
||||
rope_layers: list[int],
|
||||
):
|
||||
"""Check that the attention spec is correct."""
|
||||
assert isinstance(llm.llm_engine.model_executor, Executor)
|
||||
kv_cache_specs_per_rank = llm.llm_engine.model_executor.get_kv_cache_specs(
|
||||
)
|
||||
for rank in range(num_ranks):
|
||||
kv_cache_specs = kv_cache_specs_per_rank[rank]
|
||||
assert len(kv_cache_specs.keys()) == num_attention_layers
|
||||
for i in range(num_attention_layers):
|
||||
if rope_layers[i] == 0:
|
||||
expected_spec = FullAttentionSpec
|
||||
else:
|
||||
expected_spec = ChunkedLocalAttentionSpec
|
||||
assert isinstance(
|
||||
kv_cache_specs[
|
||||
f"language_model.model.layers.{i}.self_attn.attn"],
|
||||
expected_spec)
|
||||
|
||||
|
||||
def run_reduced_model(llm: LLM, should_profile: bool = False) -> None:
|
||||
"""Test the created reduced model with vLLM."""
|
||||
sampling_params = SamplingParams(temperature=0.8,
|
||||
top_p=0.95,
|
||||
max_tokens=50)
|
||||
@ -551,6 +584,7 @@ def run_reduced_model(model_path: str,
|
||||
@pytest.mark.parametrize("tp,ep", [(2, True)])
|
||||
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
|
||||
def test_dummy_maverick(
|
||||
monkeypatch,
|
||||
original_model_name: str,
|
||||
text_layers: int,
|
||||
num_experts: int,
|
||||
@ -562,6 +596,10 @@ def test_dummy_maverick(
|
||||
force_recreate: bool = True,
|
||||
profile: bool = False,
|
||||
) -> None:
|
||||
# Disable multiprocessing allows us to access model executor from LLM engine
|
||||
monkeypatch.setenv("VLLM_USE_V1", "1")
|
||||
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
|
||||
|
||||
model_path = create_reduced_maverick_model(
|
||||
original_model_name=original_model_name,
|
||||
output_dir=output_dir,
|
||||
@ -573,11 +611,27 @@ def test_dummy_maverick(
|
||||
|
||||
print(f"\nReduced model created successfully at: {model_path}")
|
||||
|
||||
run_reduced_model(model_path=model_path,
|
||||
should_profile=profile,
|
||||
enforce_eager=enforce_eager,
|
||||
tensor_parallel_size=tp,
|
||||
enable_expert_parallel=ep)
|
||||
rope_layers = get_rope_layers_config(model_path)
|
||||
|
||||
llm = LLM(
|
||||
model=model_path,
|
||||
trust_remote_code=True,
|
||||
max_model_len=512, # Small context for testing
|
||||
gpu_memory_utilization=0.3, # Conservative memory usage
|
||||
enforce_eager=enforce_eager,
|
||||
tensor_parallel_size=tp,
|
||||
enable_expert_parallel=ep,
|
||||
)
|
||||
|
||||
check_attention_spec_interleaved_rope(
|
||||
llm,
|
||||
text_layers,
|
||||
tp,
|
||||
rope_layers,
|
||||
)
|
||||
|
||||
print(f"\nTesting reduced model at {model_path}...")
|
||||
run_reduced_model(llm=llm, should_profile=profile)
|
||||
|
||||
|
||||
def main():
|
||||
|
||||
252
tests/models/multimodal/generation/test_phi4_multimodal.py
Normal file
252
tests/models/multimodal/generation/test_phi4_multimodal.py
Normal file
@ -0,0 +1,252 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import os
|
||||
from collections.abc import Sequence
|
||||
from typing import Optional
|
||||
|
||||
import librosa
|
||||
import pytest
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
from vllm.assets.image import ImageAsset
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.multimodal.image import rescale_image_size
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from ....conftest import (IMAGE_ASSETS, HfRunner, PromptAudioInput,
|
||||
PromptImageInput, VllmRunner)
|
||||
from ....utils import large_gpu_test
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||
"stop_sign":
|
||||
"<|user|>\n<|image|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n", # noqa: E501
|
||||
"cherry_blossom":
|
||||
"<|user|>\n<|image|>\nPlease infer the season with reason in details.<|end|>\n<|assistant|>\n", # noqa: E501
|
||||
})
|
||||
HF_MULTIIMAGE_IMAGE_PROMPT = "<|user|>\n<|image|>\n<|image|>\nDescribe these images.<|end|>\n<|assistant|>\n" # noqa: E501
|
||||
|
||||
model_path = snapshot_download("microsoft/Phi-4-multimodal-instruct",
|
||||
revision="refs/pr/70")
|
||||
# Since the vision-lora and speech-lora co-exist with the base model,
|
||||
# we have to manually specify the path of the lora weights.
|
||||
vision_lora_path = os.path.join(model_path, "vision-lora")
|
||||
speech_question = os.path.join(model_path, "examples",
|
||||
"what_is_shown_in_this_image.wav")
|
||||
models = [model_path]
|
||||
|
||||
target_dtype = "half"
|
||||
|
||||
# ROCm Triton FA can run into shared memory issues with these models,
|
||||
# use other backends in the meantime
|
||||
# FIXME (mattwong, gshtrasb, hongxiayan)
|
||||
if current_platform.is_rocm():
|
||||
os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"
|
||||
|
||||
|
||||
def run_test(
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
inputs: Sequence[tuple[list[str], PromptImageInput,
|
||||
Optional[PromptAudioInput]]],
|
||||
model: str,
|
||||
*,
|
||||
max_model_len: int,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
mm_limit: int,
|
||||
tensor_parallel_size: int,
|
||||
distributed_executor_backend: Optional[str] = None,
|
||||
):
|
||||
"""Inference result should be the same between hf and vllm.
|
||||
|
||||
All the image fixtures for the test are from IMAGE_ASSETS.
|
||||
For huggingface runner, we provide the PIL images as input.
|
||||
For vllm runner, we provide MultiModalDataDict objects
|
||||
and corresponding MultiModalConfig as input.
|
||||
Note, the text input is also adjusted to abide by vllm contract.
|
||||
The text output is sanitized to be able to compare with hf.
|
||||
"""
|
||||
# NOTE: take care of the order. run vLLM first, and then run HF.
|
||||
# vLLM needs a fresh new process without cuda initialization.
|
||||
# if we run HF first, the cuda initialization will be done and it
|
||||
# will hurt multiprocessing backend with fork method (the default method).
|
||||
# max_model_len should be greater than image_feature_size
|
||||
with vllm_runner(
|
||||
model,
|
||||
task="generate",
|
||||
max_model_len=max_model_len,
|
||||
max_num_seqs=2,
|
||||
dtype=dtype,
|
||||
limit_mm_per_prompt={"image": mm_limit},
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enable_lora=True,
|
||||
max_lora_rank=320,
|
||||
gpu_memory_utilization=0.8, # set to 0.8 to avoid OOM in CI
|
||||
enforce_eager=True,
|
||||
trust_remote_code=False,
|
||||
) as vllm_model:
|
||||
lora_request = LoRARequest("vision", 1, vision_lora_path)
|
||||
vllm_outputs_per_case = [
|
||||
vllm_model.generate_greedy_logprobs(prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
images=images,
|
||||
audios=audios,
|
||||
lora_request=lora_request)
|
||||
for prompts, images, audios in inputs
|
||||
]
|
||||
|
||||
with hf_runner(model, dtype=dtype) as hf_model:
|
||||
hf_model.model.load_adapter(
|
||||
vision_lora_path,
|
||||
adapter_name="vision",
|
||||
)
|
||||
hf_processor = hf_model.processor
|
||||
eos_token_id = hf_processor.tokenizer.eos_token_id
|
||||
hf_outputs_per_case = [
|
||||
hf_model.generate_greedy_logprobs_limit(prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
images=images,
|
||||
audios=audios,
|
||||
eos_token_id=eos_token_id)
|
||||
for prompts, images, audios in inputs
|
||||
]
|
||||
|
||||
for hf_outputs, vllm_outputs in zip(hf_outputs_per_case,
|
||||
vllm_outputs_per_case):
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", models)
|
||||
@pytest.mark.parametrize(
|
||||
"size_factors",
|
||||
[
|
||||
# No image
|
||||
[],
|
||||
# Single-scale
|
||||
[1.0],
|
||||
# Single-scale, batched
|
||||
[1.0, 1.0, 1.0],
|
||||
# Multi-scale
|
||||
[0.25, 0.5, 1.0],
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", [target_dtype])
|
||||
@pytest.mark.parametrize("max_model_len", [12800])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [10])
|
||||
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
|
||||
dtype: str, max_model_len: int, max_tokens: int,
|
||||
num_logprobs: int) -> None:
|
||||
images = [asset.pil_image for asset in image_assets]
|
||||
|
||||
inputs_per_image = [(
|
||||
[prompt for _ in size_factors],
|
||||
[rescale_image_size(image, factor) for factor in size_factors],
|
||||
None,
|
||||
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
|
||||
|
||||
run_test(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
inputs_per_image,
|
||||
model,
|
||||
dtype=dtype,
|
||||
max_model_len=max_model_len,
|
||||
max_tokens=max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
mm_limit=1,
|
||||
tensor_parallel_size=1,
|
||||
)
|
||||
|
||||
|
||||
@large_gpu_test(min_gb=48)
|
||||
@pytest.mark.parametrize("model", models)
|
||||
@pytest.mark.parametrize(
|
||||
"size_factors",
|
||||
[
|
||||
# No image
|
||||
# [],
|
||||
# Single-scale
|
||||
[1.0],
|
||||
# Single-scale, batched
|
||||
[1.0, 1.0, 1.0],
|
||||
# Multi-scale
|
||||
[0.25, 0.5, 1.0],
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", [target_dtype])
|
||||
@pytest.mark.parametrize("max_model_len", [25600])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [10])
|
||||
def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
|
||||
size_factors, dtype: str, max_model_len: int,
|
||||
max_tokens: int, num_logprobs: int) -> None:
|
||||
images = [asset.pil_image for asset in image_assets]
|
||||
|
||||
inputs_per_case = [
|
||||
(
|
||||
[HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors],
|
||||
[[rescale_image_size(image, factor) for image in images]
|
||||
for factor in size_factors],
|
||||
None,
|
||||
),
|
||||
]
|
||||
|
||||
run_test(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
inputs_per_case,
|
||||
model,
|
||||
dtype=dtype,
|
||||
max_model_len=max_model_len,
|
||||
max_tokens=max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
mm_limit=2,
|
||||
tensor_parallel_size=1,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", models)
|
||||
@pytest.mark.parametrize("dtype", [target_dtype])
|
||||
@pytest.mark.parametrize("max_model_len", [12800])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [10])
|
||||
def test_vision_speech_models(hf_runner, vllm_runner, model, dtype: str,
|
||||
max_model_len: int, max_tokens: int,
|
||||
num_logprobs: int) -> None:
|
||||
|
||||
# use the example speech question so that the model outputs are reasonable
|
||||
audio = librosa.load(speech_question, sr=16000)
|
||||
image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
|
||||
|
||||
inputs_vision_speech = [
|
||||
(
|
||||
["<|user|><|image|><|audio|><|end|><|assistant|>"],
|
||||
[image],
|
||||
[audio],
|
||||
),
|
||||
]
|
||||
|
||||
run_test(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
inputs_vision_speech,
|
||||
model,
|
||||
dtype=dtype,
|
||||
max_model_len=max_model_len,
|
||||
max_tokens=max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
mm_limit=1,
|
||||
tensor_parallel_size=1,
|
||||
)
|
||||
@ -41,12 +41,18 @@ def glm4_1v_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict:
|
||||
|
||||
|
||||
def _test_processing_correctness(
|
||||
model_id: str,
|
||||
model_id_or_arch: str,
|
||||
hit_rate: float,
|
||||
num_batches: int,
|
||||
simplify_rate: float,
|
||||
):
|
||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
|
||||
if model_id_or_arch in HF_EXAMPLE_MODELS.get_supported_archs():
|
||||
# Use model architecture to get the default model id
|
||||
model_info = HF_EXAMPLE_MODELS.get_hf_info(model_id_or_arch)
|
||||
model_id = model_info.default
|
||||
else:
|
||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id_or_arch)
|
||||
model_id = model_id_or_arch
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
model_info.check_transformers_version(on_fail="skip")
|
||||
|
||||
@ -58,7 +64,7 @@ def _test_processing_correctness(
|
||||
trust_remote_code=model_info.trust_remote_code,
|
||||
seed=0,
|
||||
dtype="auto",
|
||||
revision=None,
|
||||
revision=model_info.revision,
|
||||
hf_overrides=model_info.hf_overrides,
|
||||
)
|
||||
|
||||
@ -272,12 +278,14 @@ def _test_processing_correctness_one(
|
||||
"THUDM/GLM-4.1V-9B-Thinking",
|
||||
"ibm-granite/granite-speech-3.3-2b",
|
||||
"h2oai/h2ovl-mississippi-800m",
|
||||
"internlm/Intern-S1",
|
||||
"OpenGVLab/InternVL2-1B",
|
||||
"OpenGVLab/InternVL3-1B",
|
||||
"HuggingFaceM4/Idefics3-8B-Llama3",
|
||||
"HuggingFaceTB/SmolVLM2-2.2B-Instruct",
|
||||
"moonshotai/Kimi-VL-A3B-Instruct",
|
||||
"meta-llama/Llama-4-Scout-17B-16E-Instruct",
|
||||
"naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B",
|
||||
"llava-hf/llava-1.5-7b-hf",
|
||||
"llava-hf/llava-v1.6-mistral-7b-hf",
|
||||
"llava-hf/LLaVA-NeXT-Video-7B-hf",
|
||||
@ -330,6 +338,28 @@ def test_processing_correctness(
|
||||
)
|
||||
|
||||
|
||||
# Phi4MultimodalForCausalLM share same model repo with original format
|
||||
# Phi4MMForCausalLM, so we add it as a separate test case
|
||||
# Remove this test after conversion PR merged:
|
||||
# https://huggingface.co/microsoft/Phi-4-multimodal-instruct/discussions/70
|
||||
@pytest.mark.parametrize("model_arch", ["Phi4MultimodalForCausalLM"])
|
||||
@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
|
||||
@pytest.mark.parametrize("num_batches", [32])
|
||||
@pytest.mark.parametrize("simplify_rate", [1.0])
|
||||
def test_processing_correctness_phi4_multimodal(
|
||||
model_arch: str,
|
||||
hit_rate: float,
|
||||
num_batches: int,
|
||||
simplify_rate: float,
|
||||
):
|
||||
_test_processing_correctness(
|
||||
model_arch,
|
||||
hit_rate=hit_rate,
|
||||
num_batches=num_batches,
|
||||
simplify_rate=simplify_rate,
|
||||
)
|
||||
|
||||
|
||||
def _assert_inputs_equal(
|
||||
a: MultiModalInputs,
|
||||
b: MultiModalInputs,
|
||||
|
||||
@ -201,6 +201,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
|
||||
trust_remote_code=True),
|
||||
"HunYuanDenseV1ForCausalLM":_HfExamplesInfo("tencent/Hunyuan-7B-Instruct-0124",
|
||||
trust_remote_code=True),
|
||||
"HCXVisionForCausalLM": _HfExamplesInfo(
|
||||
"naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B",
|
||||
trust_remote_code=True),
|
||||
"InternLMForCausalLM": _HfExamplesInfo("internlm/internlm-chat-7b",
|
||||
trust_remote_code=True),
|
||||
"InternLM2ForCausalLM": _HfExamplesInfo("internlm/internlm2-chat-7b",
|
||||
@ -218,6 +221,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
|
||||
"fp8": "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8"}), # noqa: E501
|
||||
"LLaMAForCausalLM": _HfExamplesInfo("decapoda-research/llama-7b-hf",
|
||||
is_available_online=False),
|
||||
"Llama4ForCausalLM": _HfExamplesInfo("meta-llama/Llama-4-Scout-17B-16E-Instruct", # noqa: E501
|
||||
is_available_online=False),
|
||||
"MambaForCausalLM": _HfExamplesInfo("state-spaces/mamba-130m-hf"),
|
||||
"Mamba2ForCausalLM": _HfExamplesInfo("mistralai/Mamba-Codestral-7B-v0.1"),
|
||||
"FalconMambaForCausalLM": _HfExamplesInfo("tiiuae/falcon-mamba-7b-instruct"), # noqa: E501
|
||||
@ -376,6 +381,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
|
||||
extras={"2B": "OpenGVLab/InternVL2-2B",
|
||||
"3.0": "OpenGVLab/InternVL3-1B"}, # noqa: E501
|
||||
trust_remote_code=True),
|
||||
"InternS1ForConditionalGeneration": _HfExamplesInfo("internlm/Intern-S1",
|
||||
trust_remote_code=True),
|
||||
"Idefics3ForConditionalGeneration": _HfExamplesInfo("HuggingFaceM4/Idefics3-8B-Llama3", # noqa: E501
|
||||
{"tiny": "HuggingFaceTB/SmolVLM-256M-Instruct"}), # noqa: E501
|
||||
"KeyeForConditionalGeneration": _HfExamplesInfo("Kwai-Keye/Keye-VL-8B-Preview", # noqa: E501
|
||||
@ -426,6 +433,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
|
||||
"1.6-gemma": "AIDC-AI/Ovis1.6-Gemma2-9B"}), # noqa: E501
|
||||
"Phi4MMForCausalLM": _HfExamplesInfo("microsoft/Phi-4-multimodal-instruct",
|
||||
trust_remote_code=True),
|
||||
"Phi4MultimodalForCausalLM": _HfExamplesInfo("microsoft/Phi-4-multimodal-instruct", # noqa: E501
|
||||
revision="refs/pr/70"),
|
||||
"PixtralForConditionalGeneration": _HfExamplesInfo("mistralai/Pixtral-12B-2409", # noqa: E501
|
||||
tokenizer_mode="mistral"),
|
||||
"QwenVLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen-VL",
|
||||
|
||||
@ -17,7 +17,9 @@ from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tenso
|
||||
CompressedTensorsW4A4Fp4, CompressedTensorsW4A16Fp4,
|
||||
CompressedTensorsW4A16Sparse24, CompressedTensorsW8A8Fp8,
|
||||
CompressedTensorsW8A8Int8, CompressedTensorsW8A16Fp8,
|
||||
CompressedTensorsWNA16, cutlass_fp4_supported)
|
||||
CompressedTensorsWNA16)
|
||||
from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
||||
cutlass_fp4_supported)
|
||||
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
|
||||
sparse_cutlass_supported)
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
@ -8,7 +8,10 @@ import pytest
|
||||
|
||||
from tests.quantization.utils import is_quant_method_supported
|
||||
|
||||
MODELS = ["microsoft/Phi-3-mini-4k-instruct"]
|
||||
MODELS = [
|
||||
"microsoft/Phi-3-mini-4k-instruct", # dense model
|
||||
"ai21labs/Jamba-tiny-dev", # MoE model
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.skipif(not is_quant_method_supported("rtn"),
|
||||
|
||||
@ -4,6 +4,7 @@
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.model_executor.models.fuyu import FuyuImagePatchInputs
|
||||
from vllm.model_executor.models.phi3v import Phi3VImagePixelInputs
|
||||
|
||||
|
||||
@ -124,3 +125,24 @@ def test_tensor_schema_with_invalid_resolve_binding_dims():
|
||||
"w": 336
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
def test_tensor_schema_with_list_of_symbolic_dim():
|
||||
flat_data = torch.stack([torch.randn(768) for _ in range(3)]) # (bn=3, fn)
|
||||
patches_per_image = [64, 64, 64] # len = bn = 3
|
||||
|
||||
FuyuImagePatchInputs(
|
||||
flat_data=flat_data,
|
||||
patches_per_image=patches_per_image,
|
||||
)
|
||||
|
||||
|
||||
def test_tensor_schema_with_list_of_symbolic_dim_mismatch_in_length():
|
||||
flat_data = torch.stack([torch.randn(768) for _ in range(4)]) # (bn=4, fn)
|
||||
patches_per_image = [64, 64, 64] # len = 3 ≠ bn
|
||||
|
||||
with pytest.raises(ValueError, match="expected 'bn'=4, got 3"):
|
||||
FuyuImagePatchInputs(
|
||||
flat_data=flat_data,
|
||||
patches_per_image=patches_per_image,
|
||||
)
|
||||
@ -93,6 +93,7 @@ def create_common_attn_metadata(
|
||||
max_query_len=max_query_len,
|
||||
block_table_tensor=block_table_tensor,
|
||||
slot_mapping=slot_mapping,
|
||||
causal=True,
|
||||
)
|
||||
|
||||
|
||||
|
||||
@ -13,7 +13,6 @@ UNSUPPORTED_MODELS_V1 = [
|
||||
"openai/whisper-large-v3", # transcription
|
||||
"facebook/bart-large-cnn", # encoder decoder
|
||||
"state-spaces/mamba-130m-hf", # mamba1
|
||||
"BAAI/bge-m3", # embedding
|
||||
]
|
||||
|
||||
MODEL = "meta-llama/Llama-3.2-1B-Instruct"
|
||||
|
||||
@ -1,9 +1,8 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import re
|
||||
|
||||
import pytest
|
||||
import regex as re
|
||||
import requests
|
||||
import torch
|
||||
|
||||
|
||||
@ -59,7 +59,7 @@ def test_basic(
|
||||
# actually test chunked prompt
|
||||
max_num_batched_tokens=1024,
|
||||
max_model_len=8192,
|
||||
gpu_memory_utilization=0.95,
|
||||
gpu_memory_utilization=0.7,
|
||||
max_num_seqs=max_num_seqs,
|
||||
tensor_parallel_size=tensor_parallel_size) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_greedy(example_prompts,
|
||||
|
||||
@ -67,4 +67,9 @@ class InfEncoder(json.JSONEncoder):
|
||||
|
||||
def write_to_json(filename: str, records: list) -> None:
|
||||
with open(filename, "w") as f:
|
||||
json.dump(records, f, cls=InfEncoder)
|
||||
json.dump(
|
||||
records,
|
||||
f,
|
||||
cls=InfEncoder,
|
||||
default=lambda o: f"<{type(o).__name__} is not JSON serializable>",
|
||||
)
|
||||
|
||||
@ -4790,26 +4790,26 @@ class VllmConfig:
|
||||
|
||||
def __str__(self):
|
||||
return (
|
||||
f"model={self.model_config.model!r},"
|
||||
f" speculative_config={self.speculative_config!r},"
|
||||
f" tokenizer={self.model_config.tokenizer!r}, "
|
||||
f"skip_tokenizer_init={self.model_config.skip_tokenizer_init},"
|
||||
f" tokenizer_mode={self.model_config.tokenizer_mode}, "
|
||||
f"model={self.model_config.model!r}, "
|
||||
f"speculative_config={self.speculative_config!r}, "
|
||||
f"tokenizer={self.model_config.tokenizer!r}, "
|
||||
f"skip_tokenizer_init={self.model_config.skip_tokenizer_init}, "
|
||||
f"tokenizer_mode={self.model_config.tokenizer_mode}, "
|
||||
f"revision={self.model_config.revision}, "
|
||||
f"override_neuron_config={self.model_config.override_neuron_config},"
|
||||
f" tokenizer_revision={self.model_config.tokenizer_revision}, "
|
||||
f"override_neuron_config={self.model_config.override_neuron_config}, " # noqa
|
||||
f"tokenizer_revision={self.model_config.tokenizer_revision}, "
|
||||
f"trust_remote_code={self.model_config.trust_remote_code}, "
|
||||
f"dtype={self.model_config.dtype}, "
|
||||
f"max_seq_len={self.model_config.max_model_len},"
|
||||
f" download_dir={self.load_config.download_dir!r}, "
|
||||
f"max_seq_len={self.model_config.max_model_len}, "
|
||||
f"download_dir={self.load_config.download_dir!r}, "
|
||||
f"load_format={self.load_config.load_format}, "
|
||||
f"tensor_parallel_size={self.parallel_config.tensor_parallel_size},"
|
||||
f" pipeline_parallel_size={self.parallel_config.pipeline_parallel_size}, " # noqa
|
||||
f"tensor_parallel_size={self.parallel_config.tensor_parallel_size}, " # noqa
|
||||
f"pipeline_parallel_size={self.parallel_config.pipeline_parallel_size}, " # noqa
|
||||
f"disable_custom_all_reduce={self.parallel_config.disable_custom_all_reduce}, " # noqa
|
||||
f"quantization={self.model_config.quantization}, "
|
||||
f"enforce_eager={self.model_config.enforce_eager}, "
|
||||
f"kv_cache_dtype={self.cache_config.cache_dtype}, "
|
||||
f" device_config={self.device_config.device}, "
|
||||
f"device_config={self.device_config.device}, "
|
||||
f"decoding_config={self.decoding_config!r}, "
|
||||
f"observability_config={self.observability_config!r}, "
|
||||
f"seed={self.model_config.seed}, "
|
||||
|
||||
@ -156,8 +156,16 @@ class SharedStorageConnector(KVConnectorBase_V1):
|
||||
logger.info("Inject KV cache of %d tokens to the paged memory",
|
||||
len(request.slot_mapping))
|
||||
for layer_name in forward_context.no_compile_layers:
|
||||
attn_layer = forward_context.no_compile_layers[layer_name]
|
||||
kv_cache_layer = attn_layer.kv_cache[\
|
||||
layer = forward_context.no_compile_layers[layer_name]
|
||||
|
||||
# Only process layers that have kv_cache
|
||||
# attribute (attention layers) Skip non-attention
|
||||
# layers like FusedMoE/MLP etc.
|
||||
kv_cache_attr = getattr(layer, 'kv_cache', None)
|
||||
if kv_cache_attr is None:
|
||||
continue
|
||||
|
||||
kv_cache_layer = kv_cache_attr[ \
|
||||
forward_context.virtual_engine]
|
||||
|
||||
filename = self._generate_filename_debug(
|
||||
|
||||
@ -1649,7 +1649,8 @@ class EngineArgs:
|
||||
|
||||
if (self.max_num_seqs is None
|
||||
and usage_context in default_max_num_seqs):
|
||||
self.max_num_seqs = default_max_num_seqs[usage_context]
|
||||
self.max_num_seqs = min(default_max_num_seqs[usage_context],
|
||||
self.max_num_batched_tokens or sys.maxsize)
|
||||
|
||||
logger.debug("Setting max_num_seqs to %d for %s usage context.",
|
||||
self.max_num_seqs, use_context_value)
|
||||
|
||||
@ -97,11 +97,16 @@ class MQLLMEngineClient(EngineClient):
|
||||
self.model_config = engine_config.model_config
|
||||
self.decoding_config = engine_config.decoding_config
|
||||
|
||||
# Create the tokenizer group.
|
||||
self.tokenizer = init_tokenizer_from_configs(
|
||||
model_config=self.model_config,
|
||||
scheduler_config=engine_config.scheduler_config,
|
||||
lora_config=engine_config.lora_config)
|
||||
if self.vllm_config.model_config.skip_tokenizer_init:
|
||||
self.tokenizer = None
|
||||
|
||||
else:
|
||||
# Create the tokenizer group.
|
||||
self.tokenizer = init_tokenizer_from_configs(
|
||||
model_config=self.model_config,
|
||||
scheduler_config=engine_config.scheduler_config,
|
||||
lora_config=engine_config.lora_config)
|
||||
|
||||
self.input_preprocessor = InputPreprocessor(self.model_config,
|
||||
self.tokenizer)
|
||||
|
||||
@ -375,7 +380,10 @@ class MQLLMEngineClient(EngineClient):
|
||||
return self.input_preprocessor
|
||||
|
||||
async def get_tokenizer(self, lora_request: Optional[LoRARequest] = None):
|
||||
return await self.tokenizer.get_lora_tokenizer_async(lora_request)
|
||||
if self.tokenizer is None:
|
||||
return None
|
||||
else:
|
||||
return await self.tokenizer.get_lora_tokenizer_async(lora_request)
|
||||
|
||||
async def get_vllm_config(self) -> VllmConfig:
|
||||
return self.vllm_config
|
||||
|
||||
@ -14,6 +14,7 @@ from pydantic import ValidationError
|
||||
from tqdm.auto import tqdm
|
||||
from typing_extensions import TypeVar, deprecated
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.beam_search import (BeamSearchInstance, BeamSearchOutput,
|
||||
BeamSearchSequence,
|
||||
create_sort_beams_key_function)
|
||||
@ -44,9 +45,10 @@ from vllm.model_executor.layers.quantization import QuantizationMethods
|
||||
from vllm.outputs import (ClassificationRequestOutput, EmbeddingRequestOutput,
|
||||
PoolingRequestOutput, RequestOutput,
|
||||
ScoringRequestOutput)
|
||||
from vllm.pooling_params import PoolingParams, PoolingTask
|
||||
from vllm.pooling_params import PoolingParams
|
||||
from vllm.sampling_params import (BeamSearchParams, GuidedDecodingParams,
|
||||
RequestOutputKind, SamplingParams)
|
||||
from vllm.tasks import PoolingTask
|
||||
from vllm.transformers_utils.tokenizer import (AnyTokenizer, MistralTokenizer,
|
||||
get_cached_tokenizer)
|
||||
from vllm.usage.usage_lib import UsageContext
|
||||
@ -277,6 +279,16 @@ class LLM:
|
||||
self.request_counter = Counter()
|
||||
self.default_sampling_params: Union[dict[str, Any], None] = None
|
||||
|
||||
if envs.VLLM_USE_V1:
|
||||
supported_tasks = self.llm_engine \
|
||||
.get_supported_tasks() # type: ignore
|
||||
else:
|
||||
supported_tasks = self.llm_engine.model_config.supported_tasks
|
||||
|
||||
logger.info("Supported_tasks: %s", supported_tasks)
|
||||
|
||||
self.supported_tasks = supported_tasks
|
||||
|
||||
def get_tokenizer(
|
||||
self,
|
||||
lora_request: Optional[LoRARequest] = None,
|
||||
@ -1170,8 +1182,7 @@ class LLM:
|
||||
A list of `EmbeddingRequestOutput` objects containing the
|
||||
embedding vectors in the same order as the input prompts.
|
||||
"""
|
||||
model_config = self.llm_engine.model_config
|
||||
if "embed" not in model_config.supported_tasks:
|
||||
if "embed" not in self.supported_tasks:
|
||||
raise ValueError("Embedding API is not supported by this model. "
|
||||
"Please set `--task embed`.")
|
||||
|
||||
@ -1215,8 +1226,7 @@ class LLM:
|
||||
A list of `ClassificationRequestOutput` objects containing the
|
||||
embedding vectors in the same order as the input prompts.
|
||||
"""
|
||||
model_config = self.llm_engine.model_config
|
||||
if "classify" not in model_config.supported_tasks:
|
||||
if "classify" not in self.supported_tasks:
|
||||
raise ValueError(
|
||||
"Classification API is not supported by this model. "
|
||||
"Please set `--task classify`.")
|
||||
@ -1397,8 +1407,8 @@ class LLM:
|
||||
|
||||
raise ValueError(" ".join(messages))
|
||||
|
||||
if all(t not in model_config.supported_tasks
|
||||
for t in ("embed", "classify")):
|
||||
supported_tasks = self.supported_tasks
|
||||
if all(t not in supported_tasks for t in ("embed", "classify")):
|
||||
raise ValueError("Score API is not supported by this model. "
|
||||
"Please set `--task embed` or `--task classify`.")
|
||||
|
||||
|
||||
@ -1586,6 +1586,14 @@ async def init_app_state(
|
||||
state.vllm_config = vllm_config
|
||||
model_config = vllm_config.model_config
|
||||
|
||||
if envs.VLLM_USE_V1:
|
||||
supported_tasks = await engine_client \
|
||||
.get_supported_tasks() # type: ignore
|
||||
else:
|
||||
supported_tasks = model_config.supported_tasks
|
||||
|
||||
logger.info("Supported_tasks: %s", supported_tasks)
|
||||
|
||||
resolved_chat_template = load_chat_template(args.chat_template)
|
||||
if resolved_chat_template is not None:
|
||||
# Get the tokenizer to check official template
|
||||
@ -1647,7 +1655,7 @@ async def init_app_state(
|
||||
reasoning_parser=args.reasoning_parser,
|
||||
enable_prompt_tokens_details=args.enable_prompt_tokens_details,
|
||||
enable_force_include_usage=args.enable_force_include_usage,
|
||||
) if "generate" in model_config.supported_tasks else None
|
||||
) if "generate" in supported_tasks else None
|
||||
state.openai_serving_chat = OpenAIServingChat(
|
||||
engine_client,
|
||||
model_config,
|
||||
@ -1664,7 +1672,7 @@ async def init_app_state(
|
||||
reasoning_parser=args.reasoning_parser,
|
||||
enable_prompt_tokens_details=args.enable_prompt_tokens_details,
|
||||
enable_force_include_usage=args.enable_force_include_usage,
|
||||
) if "generate" in model_config.supported_tasks else None
|
||||
) if "generate" in supported_tasks else None
|
||||
state.openai_serving_completion = OpenAIServingCompletion(
|
||||
engine_client,
|
||||
model_config,
|
||||
@ -1673,7 +1681,7 @@ async def init_app_state(
|
||||
return_tokens_as_token_ids=args.return_tokens_as_token_ids,
|
||||
enable_prompt_tokens_details=args.enable_prompt_tokens_details,
|
||||
enable_force_include_usage=args.enable_force_include_usage,
|
||||
) if "generate" in model_config.supported_tasks else None
|
||||
) if "generate" in supported_tasks else None
|
||||
state.openai_serving_pooling = OpenAIServingPooling(
|
||||
engine_client,
|
||||
model_config,
|
||||
@ -1681,7 +1689,7 @@ async def init_app_state(
|
||||
request_logger=request_logger,
|
||||
chat_template=resolved_chat_template,
|
||||
chat_template_content_format=args.chat_template_content_format,
|
||||
) if "encode" in model_config.supported_tasks else None
|
||||
) if "encode" in supported_tasks else None
|
||||
state.openai_serving_embedding = OpenAIServingEmbedding(
|
||||
engine_client,
|
||||
model_config,
|
||||
@ -1689,24 +1697,22 @@ async def init_app_state(
|
||||
request_logger=request_logger,
|
||||
chat_template=resolved_chat_template,
|
||||
chat_template_content_format=args.chat_template_content_format,
|
||||
) if "embed" in model_config.supported_tasks else None
|
||||
) if "embed" in supported_tasks else None
|
||||
state.openai_serving_classification = ServingClassification(
|
||||
engine_client,
|
||||
model_config,
|
||||
state.openai_serving_models,
|
||||
request_logger=request_logger,
|
||||
) if "classify" in model_config.supported_tasks else None
|
||||
) if "classify" in supported_tasks else None
|
||||
|
||||
enable_serving_reranking = ("classify" in model_config.supported_tasks
|
||||
and getattr(model_config.hf_config,
|
||||
"num_labels", 0) == 1)
|
||||
enable_serving_reranking = ("classify" in supported_tasks and getattr(
|
||||
model_config.hf_config, "num_labels", 0) == 1)
|
||||
state.openai_serving_scores = ServingScores(
|
||||
engine_client,
|
||||
model_config,
|
||||
state.openai_serving_models,
|
||||
request_logger=request_logger,
|
||||
) if ("embed" in model_config.supported_tasks
|
||||
or enable_serving_reranking) else None
|
||||
) if ("embed" in supported_tasks or enable_serving_reranking) else None
|
||||
|
||||
state.openai_serving_tokenization = OpenAIServingTokenization(
|
||||
engine_client,
|
||||
@ -1721,13 +1727,13 @@ async def init_app_state(
|
||||
model_config,
|
||||
state.openai_serving_models,
|
||||
request_logger=request_logger,
|
||||
) if "transcription" in model_config.supported_tasks else None
|
||||
) if "transcription" in supported_tasks else None
|
||||
state.openai_serving_translation = OpenAIServingTranslation(
|
||||
engine_client,
|
||||
model_config,
|
||||
state.openai_serving_models,
|
||||
request_logger=request_logger,
|
||||
) if "transcription" in model_config.supported_tasks else None
|
||||
) if "transcription" in supported_tasks else None
|
||||
state.task = model_config.task
|
||||
|
||||
state.enable_server_load_tracking = args.enable_server_load_tracking
|
||||
|
||||
@ -1007,6 +1007,13 @@ class CompletionRequest(OpenAIBaseModel):
|
||||
"default: 0). Any priority other than 0 will raise an error "
|
||||
"if the served model does not use priority scheduling."),
|
||||
)
|
||||
request_id: str = Field(
|
||||
default_factory=lambda: f"{random_uuid()}",
|
||||
description=(
|
||||
"The request_id related to this request. If the caller does "
|
||||
"not set it, a random_uuid will be generated. This id is used "
|
||||
"through out the inference process and return in response."),
|
||||
)
|
||||
logits_processors: Optional[LogitsProcessors] = Field(
|
||||
default=None,
|
||||
description=(
|
||||
@ -1251,6 +1258,13 @@ class EmbeddingCompletionRequest(OpenAIBaseModel):
|
||||
"default: 0). Any priority other than 0 will raise an error "
|
||||
"if the served model does not use priority scheduling."),
|
||||
)
|
||||
request_id: str = Field(
|
||||
default_factory=lambda: f"{random_uuid()}",
|
||||
description=(
|
||||
"The request_id related to this request. If the caller does "
|
||||
"not set it, a random_uuid will be generated. This id is used "
|
||||
"through out the inference process and return in response."),
|
||||
)
|
||||
|
||||
# --8<-- [end:embedding-extra-params]
|
||||
|
||||
@ -1302,6 +1316,13 @@ class EmbeddingChatRequest(OpenAIBaseModel):
|
||||
"default: 0). Any priority other than 0 will raise an error "
|
||||
"if the served model does not use priority scheduling."),
|
||||
)
|
||||
request_id: str = Field(
|
||||
default_factory=lambda: f"{random_uuid()}",
|
||||
description=(
|
||||
"The request_id related to this request. If the caller does "
|
||||
"not set it, a random_uuid will be generated. This id is used "
|
||||
"through out the inference process and return in response."),
|
||||
)
|
||||
# --8<-- [end:chat-embedding-extra-params]
|
||||
|
||||
@model_validator(mode="before")
|
||||
|
||||
@ -14,6 +14,7 @@ import torch
|
||||
from prometheus_client import start_http_server
|
||||
from tqdm import tqdm
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.engine.arg_utils import AsyncEngineArgs, optional_type
|
||||
from vllm.engine.protocol import EngineClient
|
||||
@ -335,6 +336,14 @@ async def run_batch(
|
||||
|
||||
model_config = vllm_config.model_config
|
||||
|
||||
if envs.VLLM_USE_V1:
|
||||
supported_tasks = await engine_client \
|
||||
.get_supported_tasks() # type: ignore
|
||||
else:
|
||||
supported_tasks = model_config.supported_tasks
|
||||
|
||||
logger.info("Supported_tasks: %s", supported_tasks)
|
||||
|
||||
# Create the openai serving objects.
|
||||
openai_serving_models = OpenAIServingModels(
|
||||
engine_client=engine_client,
|
||||
@ -351,7 +360,7 @@ async def run_batch(
|
||||
chat_template=None,
|
||||
chat_template_content_format="auto",
|
||||
enable_prompt_tokens_details=args.enable_prompt_tokens_details,
|
||||
) if "generate" in model_config.supported_tasks else None
|
||||
) if "generate" in supported_tasks else None
|
||||
openai_serving_embedding = OpenAIServingEmbedding(
|
||||
engine_client,
|
||||
model_config,
|
||||
@ -359,19 +368,17 @@ async def run_batch(
|
||||
request_logger=request_logger,
|
||||
chat_template=None,
|
||||
chat_template_content_format="auto",
|
||||
) if "embed" in model_config.supported_tasks else None
|
||||
) if "embed" in supported_tasks else None
|
||||
|
||||
enable_serving_reranking = ("classify" in model_config.supported_tasks
|
||||
and getattr(model_config.hf_config,
|
||||
"num_labels", 0) == 1)
|
||||
enable_serving_reranking = ("classify" in supported_tasks and getattr(
|
||||
model_config.hf_config, "num_labels", 0) == 1)
|
||||
|
||||
openai_serving_scores = ServingScores(
|
||||
engine_client,
|
||||
model_config,
|
||||
openai_serving_models,
|
||||
request_logger=request_logger,
|
||||
) if ("embed" in model_config.supported_tasks
|
||||
or enable_serving_reranking) else None
|
||||
) if ("embed" in supported_tasks or enable_serving_reranking) else None
|
||||
|
||||
tracker = BatchProgressTracker()
|
||||
logger.info("Reading batch from %s...", args.input_file)
|
||||
|
||||
@ -113,7 +113,9 @@ class OpenAIServingCompletion(OpenAIServing):
|
||||
return self.create_error_response(
|
||||
"Echo is unsupported with prompt embeds.")
|
||||
|
||||
request_id = f"cmpl-{self._base_request_id(raw_request)}"
|
||||
request_id = (
|
||||
f"cmpl-"
|
||||
f"{self._base_request_id(raw_request, request.request_id)}")
|
||||
created_time = int(time.time())
|
||||
|
||||
request_metadata = RequestResponseMetadata(request_id=request_id)
|
||||
|
||||
@ -163,8 +163,9 @@ class OpenAIServingEmbedding(EmbeddingMixin):
|
||||
for the API specification. This API mimics the OpenAI Embedding API.
|
||||
"""
|
||||
model_name = self._get_model_name(request.model)
|
||||
request_id = (f"{self.request_id_prefix}-"
|
||||
f"{self._base_request_id(raw_request)}")
|
||||
request_id = (
|
||||
f"{self.request_id_prefix}-"
|
||||
f"{self._base_request_id(raw_request, request.request_id)}")
|
||||
|
||||
ctx = EmbeddingServeContext(
|
||||
request=request,
|
||||
|
||||
@ -880,7 +880,10 @@ class OpenAIServing:
|
||||
_chat_template_kwargs.update(chat_template_kwargs or {})
|
||||
|
||||
request_prompt: Union[str, list[int]]
|
||||
if isinstance(tokenizer, MistralTokenizer):
|
||||
|
||||
if tokenizer is None:
|
||||
request_prompt = "placeholder"
|
||||
elif isinstance(tokenizer, MistralTokenizer):
|
||||
request_prompt = apply_mistral_chat_template(
|
||||
tokenizer,
|
||||
messages=messages,
|
||||
@ -910,7 +913,14 @@ class OpenAIServing:
|
||||
request = tool_parser(tokenizer).adjust_request( # type: ignore
|
||||
request=request)
|
||||
|
||||
if isinstance(request_prompt, str):
|
||||
if tokenizer is None:
|
||||
assert isinstance(request_prompt, str), (
|
||||
"Prompt has to be a string", \
|
||||
"when the tokenizer is not initialised"
|
||||
)
|
||||
prompt_inputs = TextTokensPrompt(prompt=request_prompt,
|
||||
prompt_token_ids=[1])
|
||||
elif isinstance(request_prompt, str):
|
||||
prompt_inputs = await self._tokenize_prompt_input_async(
|
||||
request,
|
||||
tokenizer,
|
||||
@ -947,9 +957,11 @@ class OpenAIServing:
|
||||
def _load_and_validate_embed(embed: bytes) -> EmbedsPrompt:
|
||||
tensor = torch.load(io.BytesIO(base64.b64decode(embed)),
|
||||
weights_only=True)
|
||||
assert isinstance(
|
||||
tensor,
|
||||
(torch.FloatTensor, torch.BFloat16Tensor, torch.HalfTensor))
|
||||
assert isinstance(tensor, torch.Tensor) and tensor.dtype in (
|
||||
torch.float32,
|
||||
torch.bfloat16,
|
||||
torch.float16,
|
||||
)
|
||||
if tensor.dim() > 2:
|
||||
tensor = tensor.squeeze(0)
|
||||
assert tensor.dim() == 2
|
||||
|
||||
@ -96,7 +96,11 @@ class OpenAIServingPooling(OpenAIServing):
|
||||
self.max_model_len, truncate_prompt_tokens)
|
||||
lora_request = self._maybe_get_adapters(request)
|
||||
|
||||
tokenizer = await self.engine_client.get_tokenizer(lora_request)
|
||||
if self.model_config.skip_tokenizer_init:
|
||||
tokenizer = None
|
||||
else:
|
||||
tokenizer = await self.engine_client.get_tokenizer(lora_request
|
||||
)
|
||||
|
||||
if isinstance(request, PoolingChatRequest):
|
||||
(
|
||||
|
||||
@ -16,8 +16,8 @@ from vllm.config import VllmConfig
|
||||
from vllm.logger import init_logger
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.model_executor.layers.sampler import SamplerOutput
|
||||
from vllm.pooling_params import PoolingTask
|
||||
from vllm.sequence import ExecuteModelRequest, PoolerOutput
|
||||
from vllm.tasks import SupportedTask
|
||||
from vllm.utils import make_async
|
||||
from vllm.worker.worker_base import WorkerBase
|
||||
|
||||
@ -136,9 +136,9 @@ class ExecutorBase(ABC):
|
||||
return self.collective_rpc(rpc_func)
|
||||
|
||||
@cached_property # Avoid unnecessary RPC calls
|
||||
def supported_pooling_tasks(self) -> tuple[PoolingTask, ...]:
|
||||
output = self.collective_rpc("get_supported_pooling_tasks")
|
||||
return tuple({task for tasks in output for task in tasks})
|
||||
def supported_tasks(self) -> tuple[SupportedTask, ...]:
|
||||
output = self.collective_rpc("get_supported_tasks")
|
||||
return output[0]
|
||||
|
||||
def execute_model(
|
||||
self, execute_model_req: ExecuteModelRequest
|
||||
|
||||
@ -1127,6 +1127,7 @@ def flashinfer_fused_moe_blockscale_fp8(
|
||||
tile_tokens_dim=_get_tile_tokens_dim(x.shape[0], top_k,
|
||||
global_num_experts),
|
||||
routing_method_type=2, # DeepSeek-styled routing method
|
||||
use_shuffled_weight=False,
|
||||
)
|
||||
|
||||
|
||||
|
||||
@ -5,144 +5,8 @@ from typing import Optional
|
||||
import torch
|
||||
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.triton_utils import tl, triton
|
||||
from vllm.utils import cdiv, round_up
|
||||
|
||||
|
||||
@triton.jit
|
||||
def moe_align_block_size_stage1(
|
||||
topk_ids_ptr,
|
||||
tokens_cnts_ptr,
|
||||
num_experts: tl.constexpr,
|
||||
numel: tl.constexpr,
|
||||
tokens_per_thread: tl.constexpr,
|
||||
):
|
||||
pid = tl.program_id(0)
|
||||
|
||||
start_idx = pid * tokens_per_thread
|
||||
|
||||
off_c = (pid + 1) * num_experts
|
||||
|
||||
for i in range(tokens_per_thread):
|
||||
if start_idx + i < numel:
|
||||
idx = tl.load(topk_ids_ptr + start_idx + i)
|
||||
token_cnt = tl.load(tokens_cnts_ptr + off_c + idx)
|
||||
tl.store(tokens_cnts_ptr + off_c + idx, token_cnt + 1)
|
||||
|
||||
|
||||
@triton.jit
|
||||
def moe_align_block_size_stage2(
|
||||
tokens_cnts_ptr,
|
||||
num_experts: tl.constexpr,
|
||||
):
|
||||
pid = tl.program_id(0)
|
||||
|
||||
last_cnt = 0
|
||||
for i in range(1, num_experts + 1):
|
||||
token_cnt = tl.load(tokens_cnts_ptr + i * num_experts + pid)
|
||||
last_cnt = last_cnt + token_cnt
|
||||
tl.store(tokens_cnts_ptr + i * num_experts + pid, last_cnt)
|
||||
|
||||
|
||||
@triton.jit
|
||||
def moe_align_block_size_stage3(
|
||||
total_tokens_post_pad_ptr,
|
||||
tokens_cnts_ptr,
|
||||
cumsum_ptr,
|
||||
num_experts: tl.constexpr,
|
||||
block_size: tl.constexpr,
|
||||
):
|
||||
last_cumsum = 0
|
||||
off_cnt = num_experts * num_experts
|
||||
for i in range(1, num_experts + 1):
|
||||
token_cnt = tl.load(tokens_cnts_ptr + off_cnt + i - 1)
|
||||
last_cumsum = last_cumsum + tl.cdiv(token_cnt, block_size) * block_size
|
||||
tl.store(cumsum_ptr + i, last_cumsum)
|
||||
tl.store(total_tokens_post_pad_ptr, last_cumsum)
|
||||
|
||||
|
||||
@triton.jit
|
||||
def moe_align_block_size_stage4(
|
||||
topk_ids_ptr,
|
||||
sorted_token_ids_ptr,
|
||||
expert_ids_ptr,
|
||||
tokens_cnts_ptr,
|
||||
cumsum_ptr,
|
||||
num_experts: tl.constexpr,
|
||||
block_size: tl.constexpr,
|
||||
numel: tl.constexpr,
|
||||
tokens_per_thread: tl.constexpr,
|
||||
):
|
||||
pid = tl.program_id(0)
|
||||
start_idx = tl.load(cumsum_ptr + pid)
|
||||
end_idx = tl.load(cumsum_ptr + pid + 1)
|
||||
|
||||
for i in range(start_idx, end_idx, block_size):
|
||||
tl.store(expert_ids_ptr + i // block_size, pid)
|
||||
|
||||
start_idx = pid * tokens_per_thread
|
||||
off_t = pid * num_experts
|
||||
|
||||
for i in range(start_idx, tl.minimum(start_idx + tokens_per_thread,
|
||||
numel)):
|
||||
expert_id = tl.load(topk_ids_ptr + i)
|
||||
token_cnt = tl.load(tokens_cnts_ptr + off_t + expert_id)
|
||||
rank_post_pad = token_cnt + tl.load(cumsum_ptr + expert_id)
|
||||
tl.store(sorted_token_ids_ptr + rank_post_pad, i)
|
||||
tl.store(tokens_cnts_ptr + off_t + expert_id, token_cnt + 1)
|
||||
|
||||
|
||||
# Triton implementation based on:
|
||||
# https://github.com/sgl-project/sglang/commit/ba5112ff691d791a9e38c6c71f59324a5fcb49d0
|
||||
def moe_align_block_size_triton(
|
||||
topk_ids: torch.Tensor,
|
||||
num_experts: int,
|
||||
block_size: int,
|
||||
sorted_token_ids: torch.Tensor,
|
||||
expert_ids: torch.Tensor,
|
||||
num_tokens_post_pad: torch.Tensor,
|
||||
) -> None:
|
||||
numel = topk_ids.numel()
|
||||
grid = (num_experts, )
|
||||
tokens_cnts = torch.zeros((num_experts + 1, num_experts),
|
||||
dtype=torch.int32,
|
||||
device=topk_ids.device)
|
||||
cumsum = torch.zeros((num_experts + 1, ),
|
||||
dtype=torch.int32,
|
||||
device=topk_ids.device)
|
||||
tokens_per_thread = cdiv(numel, num_experts)
|
||||
sorted_token_ids.fill_(numel)
|
||||
expert_ids.zero_()
|
||||
|
||||
moe_align_block_size_stage1[grid](
|
||||
topk_ids,
|
||||
tokens_cnts,
|
||||
num_experts,
|
||||
numel,
|
||||
tokens_per_thread,
|
||||
)
|
||||
moe_align_block_size_stage2[grid](
|
||||
tokens_cnts,
|
||||
num_experts,
|
||||
)
|
||||
moe_align_block_size_stage3[(1, )](
|
||||
num_tokens_post_pad,
|
||||
tokens_cnts,
|
||||
cumsum,
|
||||
num_experts,
|
||||
block_size,
|
||||
)
|
||||
moe_align_block_size_stage4[grid](
|
||||
topk_ids,
|
||||
sorted_token_ids,
|
||||
expert_ids,
|
||||
tokens_cnts,
|
||||
cumsum,
|
||||
num_experts,
|
||||
block_size,
|
||||
numel,
|
||||
tokens_per_thread,
|
||||
)
|
||||
from vllm.triton_utils import triton
|
||||
from vllm.utils import round_up
|
||||
|
||||
|
||||
def moe_align_block_size(
|
||||
|
||||
@ -76,43 +76,43 @@ def _moe_unpermute_and_reduce(
|
||||
|
||||
def moe_permute(
|
||||
hidden_states: torch.Tensor,
|
||||
topk_weights: torch.Tensor,
|
||||
a1q_scale: Optional[torch.Tensor],
|
||||
topk_ids: torch.Tensor,
|
||||
token_expert_indices: torch.Tensor,
|
||||
topk: int,
|
||||
n_expert: int,
|
||||
n_local_expert: int,
|
||||
n_local_expert: int = -1,
|
||||
expert_map: Optional[torch.Tensor] = None,
|
||||
align_block_size: Optional[int] = None,
|
||||
fill_invalid_expert: int = -1
|
||||
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
|
||||
) -> tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor, torch.Tensor,
|
||||
torch.Tensor]:
|
||||
"""
|
||||
This function expands and permutes activation to gather uncontinuous tokens
|
||||
for each expert.
|
||||
Parameters:
|
||||
- hidden_states (torch.Tensor): The input tensor to the MoE layer.
|
||||
- topk_weights (torch.Tensor): topk expert route weight for each token.
|
||||
- a1q_scale (Optional[torch.Tensor]): quant scale for hidden_states
|
||||
- topk_ids (torch.Tensor): topk expert route id for each token.
|
||||
- token_expert_indices (torch.Tensor): indice for expanded hidden.
|
||||
- topk (int): The number of top-k experts to select.
|
||||
- n_expert (int): The number of expert.
|
||||
- n_local_expert (int): The number of expert in current EP rank.
|
||||
- expert_map (Optional[torch.Tensor]): A tensor mapping expert indices
|
||||
from the global expert space to the local expert space of the expert
|
||||
from the global expert space to the local expert space of the expert
|
||||
parallel shard.
|
||||
- align_block_size (Optional[int]): align group gemm block size for deepgemm
|
||||
- fill_invalid_expert(int): fill expert id in m_indices for invalid expert
|
||||
to workaround DeepGemm unsupported -1 in m_indices
|
||||
Returns:
|
||||
- permuted_hidden_states (torch.Tensor): permuted activation.
|
||||
- a1q_scale (Optional[torch.Tensor]): quant scale for hidden_states
|
||||
- expert_first_token_offset (torch.Tensor): offset of the first token
|
||||
of each expert for standard grouped gemm. if enable 'align_block_size'
|
||||
expert_first_token_offset will align up to 'align_block_size'.
|
||||
- src_row_id2dst_row_id_map (torch.Tensor): idx map for moe_unpermute.
|
||||
- inv_permuted_idx (torch.Tensor): idx map for moe_unpermute.
|
||||
- permuted_idx (torch.Tensor): idx map from hidden to permuted_hidden.
|
||||
- m_indices: m_indices for grouped gemm in deepgemm,`m_indices[i]` records
|
||||
the group which the j-th row of the LHS belong to.`
|
||||
"""
|
||||
n_token, n_hidden = hidden_states.size()
|
||||
topk = topk_ids.size(1)
|
||||
assert (n_hidden * hidden_states.element_size()
|
||||
) % 16 == 0, "permue kernel need hidden dim align to 16B"
|
||||
permuted_row_size = n_token * topk
|
||||
@ -120,12 +120,19 @@ def moe_permute(
|
||||
permuted_row_size = (permuted_row_size + n_expert *
|
||||
(align_block_size - 1) + align_block_size -
|
||||
1) // align_block_size * align_block_size
|
||||
|
||||
if n_local_expert == -1:
|
||||
n_local_expert = n_expert
|
||||
permuted_hidden_states = torch.empty(
|
||||
(permuted_row_size, n_hidden),
|
||||
dtype=hidden_states.dtype,
|
||||
device=hidden_states.device,
|
||||
)
|
||||
token_expert_indices = torch.arange(0,
|
||||
n_token * topk,
|
||||
dtype=torch.int32,
|
||||
device=hidden_states.device).reshape(
|
||||
(n_token, topk))
|
||||
|
||||
m_indices = torch.full((permuted_row_size, ),
|
||||
fill_invalid_expert,
|
||||
dtype=torch.int32,
|
||||
@ -133,57 +140,54 @@ def moe_permute(
|
||||
expert_first_token_offset = torch.empty(n_local_expert + 1,
|
||||
dtype=torch.int64,
|
||||
device=hidden_states.device)
|
||||
src_row_id2dst_row_id_map = torch.empty((n_token, topk),
|
||||
dtype=torch.int32,
|
||||
device=hidden_states.device)
|
||||
torch.ops._moe_C.moe_permute(hidden_states, topk_weights, topk_ids,
|
||||
token_expert_indices, expert_map, n_expert,
|
||||
n_local_expert, topk, align_block_size,
|
||||
permuted_hidden_states,
|
||||
expert_first_token_offset,
|
||||
src_row_id2dst_row_id_map, m_indices)
|
||||
return (permuted_hidden_states, expert_first_token_offset,
|
||||
src_row_id2dst_row_id_map, m_indices)
|
||||
permuted_idx = torch.full((permuted_row_size, ),
|
||||
n_token * topk,
|
||||
dtype=torch.int32,
|
||||
device=hidden_states.device)
|
||||
inv_permuted_idx = torch.empty((n_token, topk),
|
||||
dtype=torch.int32,
|
||||
device=hidden_states.device)
|
||||
topk_ids = topk_ids.to(torch.int32)
|
||||
torch.ops._moe_C.moe_permute(hidden_states, topk_ids, token_expert_indices,
|
||||
expert_map, n_expert, n_local_expert, topk,
|
||||
align_block_size, permuted_hidden_states,
|
||||
expert_first_token_offset, inv_permuted_idx,
|
||||
permuted_idx, m_indices)
|
||||
if a1q_scale is not None:
|
||||
a1q_scale = a1q_scale[permuted_idx.clamp(max=n_token * topk - 1) //
|
||||
topk]
|
||||
return (permuted_hidden_states, a1q_scale, expert_first_token_offset,
|
||||
inv_permuted_idx.flatten(), m_indices)
|
||||
|
||||
|
||||
def moe_unpermute(
|
||||
out: torch.Tensor,
|
||||
permuted_hidden_states: torch.Tensor,
|
||||
topk_weights: torch.Tensor,
|
||||
topk_ids: torch.Tensor,
|
||||
src_row_id2dst_row_id_map: torch.Tensor,
|
||||
expert_first_token_offset: torch.Tensor,
|
||||
topk: int,
|
||||
n_expert: int,
|
||||
n_local_expert: int,
|
||||
) -> torch.Tensor:
|
||||
inv_permuted_idx: torch.Tensor,
|
||||
expert_first_token_offset: Optional[torch.Tensor] = None,
|
||||
) -> None:
|
||||
"""
|
||||
This function expands and permutes activation to gathering uncontinuous
|
||||
tokens for each expert.
|
||||
Parameters:
|
||||
- out (torch.Tensor): output tensor
|
||||
- permuted_hidden_states (torch.Tensor): permuted activation.
|
||||
- topk_weights (torch.Tensor): topk expert route weight for each token.
|
||||
- topk_ids (torch.Tensor): topk expert route id for each token.
|
||||
- expert_first_token_offset (torch.Tensor): offset of the first token
|
||||
of each expert for grouped gemm.
|
||||
- topk (int): The number of top-k experts to select.
|
||||
- n_expert (int): The number of expert.
|
||||
- n_local_expert (int): The number of expert in current EP rank.
|
||||
- inv_permuted_idx (torch.Tensor): row idx map for moe_unpermute.
|
||||
- expert_first_token_offset (Optional[torch.Tensor]): offset of the first
|
||||
token of each expert for grouped gemm.
|
||||
Returns:
|
||||
- hidden_states (torch.Tensor): The reduced and unpermuted activation
|
||||
tensor.
|
||||
"""
|
||||
n_token, n_hidden = topk_weights.size(0), permuted_hidden_states.size(-1)
|
||||
topk = topk_weights.size(1)
|
||||
n_hidden = permuted_hidden_states.size(-1)
|
||||
assert (n_hidden * permuted_hidden_states.element_size()
|
||||
) % 16 == 0, "unpermue kernel need hidden dim align to 16B"
|
||||
hidden_states = torch.empty((n_token, n_hidden),
|
||||
dtype=permuted_hidden_states.dtype,
|
||||
device=permuted_hidden_states.device)
|
||||
|
||||
torch.ops._moe_C.moe_unpermute(permuted_hidden_states, topk_weights,
|
||||
topk_ids, src_row_id2dst_row_id_map,
|
||||
expert_first_token_offset, n_expert,
|
||||
n_local_expert, topk, hidden_states)
|
||||
return hidden_states
|
||||
inv_permuted_idx, expert_first_token_offset,
|
||||
topk, out)
|
||||
|
||||
|
||||
def moe_permute_unpermute_supported():
|
||||
|
||||
@ -24,6 +24,7 @@ from vllm.model_executor.layers.mamba.mamba_utils import (
|
||||
extra_groups_for_head_shards, get_mamba_state_shape)
|
||||
from vllm.model_executor.layers.mamba.ops.causal_conv1d import (
|
||||
causal_conv1d_fn, causal_conv1d_update)
|
||||
from vllm.model_executor.layers.mamba.ops.layernorm_gated import rms_norm_gated
|
||||
from vllm.model_executor.layers.mamba.ops.mamba_ssm import (
|
||||
selective_state_update)
|
||||
from vllm.model_executor.layers.mamba.ops.ssd_combined import (
|
||||
@ -133,21 +134,15 @@ class Mixer2RMSNormGated(CustomOp):
|
||||
return x * nn.functional.silu(gate.to(
|
||||
torch.float32)).to(input_dtype)
|
||||
|
||||
if self.tp_size > 1 or self.n_groups != 1:
|
||||
if (((self.n_groups % self.tp_size) != 0) or self.n_groups != 1):
|
||||
return self.forward_native(x, gate)
|
||||
|
||||
from vllm import _custom_ops as ops
|
||||
|
||||
# cast x and gate to float32 before silu
|
||||
out = torch.empty_like(x)
|
||||
y = x * nn.functional.silu(gate.to(torch.float32))
|
||||
ops.rms_norm(
|
||||
out,
|
||||
y.to(x.dtype),
|
||||
self.weight.data,
|
||||
self.variance_epsilon,
|
||||
)
|
||||
return out
|
||||
return rms_norm_gated(x,
|
||||
self.weight.data,
|
||||
bias=None,
|
||||
z=gate,
|
||||
eps=self.variance_epsilon,
|
||||
norm_before_gate=False)
|
||||
|
||||
|
||||
def mamba_v2_sharded_weight_loader(
|
||||
|
||||
168
vllm/model_executor/layers/mamba/ops/layernorm_gated.py
Normal file
168
vllm/model_executor/layers/mamba/ops/layernorm_gated.py
Normal file
@ -0,0 +1,168 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
# Copyright (c) 2024, Tri Dao.
|
||||
# Adapted from https://github.com/state-spaces/mamba/blob/60dadf2e0ee730ac337035d5533de10bc26e4847/mamba_ssm/ops/triton/layernorm_gated.py
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.triton_utils import tl, triton
|
||||
|
||||
|
||||
@triton.heuristics({"HAS_BIAS": lambda args: args["B"] is not None})
|
||||
@triton.heuristics({"HAS_Z": lambda args: args["Z"] is not None})
|
||||
@triton.jit
|
||||
def _layer_norm_fwd_1pass_kernel(
|
||||
X, # pointer to the input
|
||||
Y, # pointer to the output
|
||||
W, # pointer to the weights
|
||||
B, # pointer to the biases
|
||||
Z, # pointer to the other branch
|
||||
Mean, # pointer to the mean
|
||||
Rstd, # pointer to the 1/std
|
||||
stride_x_row: tl.int64,
|
||||
stride_y_row: tl.int64,
|
||||
stride_z_row: tl.int64,
|
||||
M: tl.int64, # number of rows in X
|
||||
N: tl.int64, # number of columns in X
|
||||
eps, # epsilon to avoid division by zero
|
||||
BLOCK_N: tl.constexpr,
|
||||
HAS_BIAS: tl.constexpr,
|
||||
HAS_Z: tl.constexpr,
|
||||
NORM_BEFORE_GATE: tl.constexpr,
|
||||
IS_RMS_NORM: tl.constexpr,
|
||||
):
|
||||
# Map the program id to the row of X and Y it should compute.
|
||||
row = tl.program_id(0)
|
||||
group = tl.program_id(1)
|
||||
X += row * stride_x_row + group * N
|
||||
Y += row * stride_y_row + group * N
|
||||
if HAS_Z:
|
||||
Z += row * stride_z_row + group * N
|
||||
if not IS_RMS_NORM:
|
||||
Mean += group * M
|
||||
Rstd += group * M
|
||||
W += group * N
|
||||
if HAS_BIAS:
|
||||
B += group * N
|
||||
# Compute mean and variance
|
||||
cols = tl.arange(0, BLOCK_N)
|
||||
x = tl.load(X + cols, mask=cols < N, other=0.).to(tl.float32)
|
||||
if HAS_Z and not NORM_BEFORE_GATE:
|
||||
z = tl.load(Z + cols, mask=cols < N).to(tl.float32)
|
||||
x *= z * tl.sigmoid(z)
|
||||
if not IS_RMS_NORM:
|
||||
mean = tl.sum(x, axis=0) / N
|
||||
tl.store(Mean + row, mean)
|
||||
xbar = tl.where(cols < N, x - mean, 0.)
|
||||
var = tl.sum(xbar * xbar, axis=0) / N
|
||||
else:
|
||||
xbar = tl.where(cols < N, x, 0.)
|
||||
var = tl.sum(xbar * xbar, axis=0) / N
|
||||
rstd = 1 / tl.sqrt(var + eps)
|
||||
tl.store(Rstd + row, rstd)
|
||||
# Normalize and apply linear transformation
|
||||
mask = cols < N
|
||||
w = tl.load(W + cols, mask=mask).to(tl.float32)
|
||||
if HAS_BIAS:
|
||||
b = tl.load(B + cols, mask=mask).to(tl.float32)
|
||||
x_hat = (x - mean) * rstd if not IS_RMS_NORM else x * rstd
|
||||
y = x_hat * w + b if HAS_BIAS else x_hat * w
|
||||
if HAS_Z and NORM_BEFORE_GATE:
|
||||
z = tl.load(Z + cols, mask=mask).to(tl.float32)
|
||||
y *= z * tl.sigmoid(z)
|
||||
# Write output
|
||||
tl.store(Y + cols, y, mask=mask)
|
||||
|
||||
|
||||
def _layer_norm_fwd(x,
|
||||
weight,
|
||||
bias,
|
||||
eps,
|
||||
z=None,
|
||||
out=None,
|
||||
group_size=None,
|
||||
norm_before_gate=True,
|
||||
is_rms_norm=False):
|
||||
M, N = x.shape
|
||||
if group_size is None:
|
||||
group_size = N
|
||||
assert N % group_size == 0
|
||||
ngroups = N // group_size
|
||||
assert x.stride(-1) == 1
|
||||
if z is not None:
|
||||
assert z.stride(-1) == 1
|
||||
assert z.shape == (M, N)
|
||||
assert weight.shape == (N, )
|
||||
assert weight.stride(-1) == 1
|
||||
if bias is not None:
|
||||
assert bias.stride(-1) == 1
|
||||
assert bias.shape == (N, )
|
||||
# allocate output
|
||||
if out is not None:
|
||||
assert out.shape == x.shape
|
||||
else:
|
||||
out = torch.empty_like(x)
|
||||
assert out.stride(-1) == 1
|
||||
mean = torch.empty((ngroups * M, ), dtype=torch.float32,
|
||||
device=x.device) if not is_rms_norm else None
|
||||
rstd = torch.empty((ngroups * M, ), dtype=torch.float32, device=x.device)
|
||||
# Less than 64KB per feature: enqueue fused kernel
|
||||
MAX_FUSED_SIZE = 65536 // x.element_size()
|
||||
BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(group_size))
|
||||
if group_size > BLOCK_N:
|
||||
raise RuntimeError(
|
||||
"This layer norm doesn't support feature dim >= 64KB.")
|
||||
# heuristics for number of warps
|
||||
num_warps = min(max(BLOCK_N // 256, 1), 8)
|
||||
grid = (M, ngroups)
|
||||
with torch.cuda.device(x.device.index):
|
||||
_layer_norm_fwd_1pass_kernel[grid](x,
|
||||
out,
|
||||
weight,
|
||||
bias,
|
||||
z,
|
||||
mean,
|
||||
rstd,
|
||||
x.stride(0),
|
||||
out.stride(0),
|
||||
z.stride(0) if z is not None else 0,
|
||||
M,
|
||||
group_size,
|
||||
eps,
|
||||
BLOCK_N=BLOCK_N,
|
||||
NORM_BEFORE_GATE=norm_before_gate,
|
||||
IS_RMS_NORM=is_rms_norm,
|
||||
num_warps=num_warps)
|
||||
return out, mean, rstd
|
||||
|
||||
|
||||
def rms_norm_gated(x,
|
||||
weight,
|
||||
bias,
|
||||
z=None,
|
||||
eps=1e-6,
|
||||
group_size=None,
|
||||
norm_before_gate=True):
|
||||
x_shape_og = x.shape
|
||||
# reshape input data into 2D tensor
|
||||
x = x.reshape(-1, x.shape[-1])
|
||||
if x.stride(-1) != 1:
|
||||
x = x.contiguous()
|
||||
if z is not None:
|
||||
assert z.shape == x_shape_og
|
||||
z = z.reshape(-1, z.shape[-1])
|
||||
if z.stride(-1) != 1:
|
||||
z = z.contiguous()
|
||||
weight = weight.contiguous()
|
||||
if bias is not None:
|
||||
bias = bias.contiguous()
|
||||
y, _, _ = _layer_norm_fwd(x,
|
||||
weight,
|
||||
bias,
|
||||
eps,
|
||||
z=z,
|
||||
group_size=group_size,
|
||||
norm_before_gate=norm_before_gate,
|
||||
is_rms_norm=True)
|
||||
|
||||
return y.reshape(x_shape_og)
|
||||
@ -16,8 +16,9 @@ from vllm.config import ModelConfig, PoolerConfig
|
||||
from vllm.model_executor.pooling_metadata import ( # noqa: E501
|
||||
PoolingMetadata as V0PoolingMetadata)
|
||||
from vllm.model_executor.pooling_metadata import PoolingTensors
|
||||
from vllm.pooling_params import PoolingParams, PoolingTask
|
||||
from vllm.pooling_params import PoolingParams
|
||||
from vllm.sequence import PoolerOutput, PoolingSequenceGroupOutput
|
||||
from vllm.tasks import PoolingTask
|
||||
from vllm.utils import resolve_obj_by_qualname
|
||||
from vllm.v1.pool.metadata import PoolingMetadata as V1PoolingMetadata
|
||||
|
||||
|
||||
@ -33,7 +33,7 @@ from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
|
||||
find_matched_target, is_activation_quantization_format,
|
||||
should_ignore_layer)
|
||||
from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
|
||||
from vllm.model_executor.layers.quantization.utils.nvfp4_emulation_utils import ( # noqa: E501
|
||||
from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
||||
cutlass_fp4_supported)
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
|
||||
@ -27,8 +27,8 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import (
|
||||
prepare_moe_fp4_layer_for_marlin)
|
||||
from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
|
||||
prepare_moe_fp8_layer_for_marlin)
|
||||
from vllm.model_executor.layers.quantization.utils.nvfp4_emulation_utils import ( # noqa: E501
|
||||
cutlass_fp4_supported)
|
||||
from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
||||
cutlass_fp4_supported, swizzle_blockscale)
|
||||
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
|
||||
all_close_1d, normalize_e4m3fn_to_e4m3fnuz, per_tensor_dequantize)
|
||||
from vllm.model_executor.utils import set_weight_attrs
|
||||
@ -193,29 +193,6 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
|
||||
{"quant_method": FusedMoeWeightScaleSupported.TENSOR.value})
|
||||
set_weight_attrs(w2_input_scale, extra_weight_attrs)
|
||||
|
||||
def swizzle_blockscale(self, scale: torch.tensor):
|
||||
assert (scale.dtype == torch.float8_e4m3fn)
|
||||
# Pad and blockwise interleave weight_scale
|
||||
scale_ndim = scale.ndim
|
||||
if scale.ndim == 2:
|
||||
scale = scale.unsqueeze(0)
|
||||
assert scale.ndim == 3
|
||||
B, M, K = scale.shape
|
||||
round_up_multiple = lambda x, m: (x + m - 1) // m * m
|
||||
M_padded = round_up_multiple(M, 128)
|
||||
K_padded = round_up_multiple(K, 4)
|
||||
padded_scale = torch.zeros((B, M_padded, K_padded), dtype=scale.dtype)
|
||||
padded_scale[:B, :M, :K] = scale
|
||||
batches, rows, cols = padded_scale.shape
|
||||
assert rows % 128 == 0
|
||||
assert cols % 4 == 0
|
||||
padded_scale = padded_scale.reshape(batches, rows // 128, 4, 32,
|
||||
cols // 4, 4)
|
||||
swizzled_scale = padded_scale.permute((0, 1, 4, 3, 2, 5))
|
||||
swizzled_scale = swizzled_scale.contiguous().cuda()
|
||||
return (swizzled_scale.reshape(M, K)
|
||||
if scale_ndim == 2 else swizzled_scale.reshape(B, M, K))
|
||||
|
||||
def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
|
||||
|
||||
# From packed to weight
|
||||
@ -243,13 +220,13 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
|
||||
return
|
||||
|
||||
# swizzle weight scales
|
||||
layer.w13_blockscale_swizzled = torch.nn.Parameter(
|
||||
self.swizzle_blockscale(layer.w13_weight_scale),
|
||||
requires_grad=False)
|
||||
layer.w13_blockscale_swizzled = torch.nn.Parameter(swizzle_blockscale(
|
||||
layer.w13_weight_scale),
|
||||
requires_grad=False)
|
||||
|
||||
layer.w2_blockscale_swizzled = torch.nn.Parameter(
|
||||
self.swizzle_blockscale(layer.w2_weight_scale),
|
||||
requires_grad=False)
|
||||
layer.w2_blockscale_swizzled = torch.nn.Parameter(swizzle_blockscale(
|
||||
layer.w2_weight_scale),
|
||||
requires_grad=False)
|
||||
|
||||
# w13
|
||||
w13_input_global_scale = layer.w13_input_global_scale.max(
|
||||
|
||||
@ -9,8 +9,7 @@ from torch.nn.parameter import Parameter
|
||||
|
||||
import vllm.envs as envs
|
||||
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
|
||||
from vllm._custom_ops import (cutlass_scaled_fp4_mm,
|
||||
cutlass_scaled_mm_supports_fp4, scaled_fp4_quant)
|
||||
from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
|
||||
from vllm.distributed import get_ep_group
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.layers.fused_moe.config import FusedMoEParallelConfig
|
||||
@ -28,7 +27,7 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import (
|
||||
apply_fp4_marlin_linear, is_fp4_marlin_supported,
|
||||
prepare_fp4_layer_for_marlin, prepare_moe_fp4_layer_for_marlin)
|
||||
from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
||||
GroupShape, is_layer_skipped)
|
||||
GroupShape, cutlass_fp4_supported, is_layer_skipped, swizzle_blockscale)
|
||||
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
|
||||
Fp8LinearOp, requantize_with_max_scale)
|
||||
from vllm.model_executor.parameter import (ModelWeightParameter,
|
||||
@ -667,14 +666,6 @@ class ModelOptNvFp4Config(QuantizationConfig):
|
||||
return None
|
||||
|
||||
|
||||
def cutlass_fp4_supported() -> bool:
|
||||
if not current_platform.is_cuda():
|
||||
return False
|
||||
capability_tuple = current_platform.get_device_capability()
|
||||
capability = -1 if capability_tuple is None else capability_tuple.to_int()
|
||||
return cutlass_scaled_mm_supports_fp4(capability)
|
||||
|
||||
|
||||
class ModelOptFp8KVCacheMethod(BaseKVCacheMethod):
|
||||
"""
|
||||
Supports loading kv-cache scaling factors from FP8 checkpoints.
|
||||
@ -772,29 +763,6 @@ class ModelOptNvFp4LinearMethod(LinearMethodBase):
|
||||
|
||||
layer.register_parameter("weight_scale", weight_scale)
|
||||
|
||||
def swizzle_blockscale(self, scale: torch.tensor):
|
||||
assert (scale.dtype == torch.float8_e4m3fn)
|
||||
# Pad and blockwise interleave weight_scale
|
||||
scale_ndim = scale.ndim
|
||||
if scale.ndim == 2:
|
||||
scale = scale.unsqueeze(0)
|
||||
assert scale.ndim == 3
|
||||
B, M, K = scale.shape
|
||||
round_up_multiple = lambda x, m: (x + m - 1) // m * m
|
||||
M_padded = round_up_multiple(M, 128)
|
||||
K_padded = round_up_multiple(K, 4)
|
||||
padded_scale = torch.zeros((B, M_padded, K_padded), dtype=scale.dtype)
|
||||
padded_scale[:B, :M, :K] = scale
|
||||
batches, rows, cols = padded_scale.shape
|
||||
assert rows % 128 == 0
|
||||
assert cols % 4 == 0
|
||||
padded_scale = padded_scale.reshape(batches, rows // 128, 4, 32,
|
||||
cols // 4, 4)
|
||||
swizzled_scale = padded_scale.permute((0, 1, 4, 3, 2, 5))
|
||||
swizzled_scale = swizzled_scale.contiguous().cuda()
|
||||
return (swizzled_scale.reshape(M, K)
|
||||
if scale_ndim == 2 else swizzled_scale.reshape(B, M, K))
|
||||
|
||||
def process_weights_after_loading(self, layer: Module) -> None:
|
||||
|
||||
# global scales:
|
||||
@ -814,7 +782,7 @@ class ModelOptNvFp4LinearMethod(LinearMethodBase):
|
||||
"Expected weight_scale.dim(1) to be divisible by 16")
|
||||
assert (layer.weight_scale.dtype == torch.float8_e4m3fn), (
|
||||
"Weight Block scale must be represented as FP8-E4M3")
|
||||
swizzled_weight_scale = self.swizzle_blockscale(layer.weight_scale)
|
||||
swizzled_weight_scale = swizzle_blockscale(layer.weight_scale)
|
||||
|
||||
layer.weight_scale_swizzled = Parameter(swizzled_weight_scale,
|
||||
requires_grad=False)
|
||||
@ -1060,29 +1028,6 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
|
||||
weight_loader=weight_loader)
|
||||
layer.register_parameter("w2_input_scale", w2_input_scale)
|
||||
|
||||
def swizzle_blockscale(self, scale: torch.tensor):
|
||||
assert (scale.dtype == torch.float8_e4m3fn)
|
||||
# Pad and blockwise interleave weight_scale
|
||||
scale_ndim = scale.ndim
|
||||
if scale.ndim == 2:
|
||||
scale = scale.unsqueeze(0)
|
||||
assert scale.ndim == 3
|
||||
B, M, K = scale.shape
|
||||
round_up_multiple = lambda x, m: (x + m - 1) // m * m
|
||||
M_padded = round_up_multiple(M, 128)
|
||||
K_padded = round_up_multiple(K, 4)
|
||||
padded_scale = torch.zeros((B, M_padded, K_padded), dtype=scale.dtype)
|
||||
padded_scale[:B, :M, :K] = scale
|
||||
batches, rows, cols = padded_scale.shape
|
||||
assert rows % 128 == 0
|
||||
assert cols % 4 == 0
|
||||
padded_scale = padded_scale.reshape(batches, rows // 128, 4, 32,
|
||||
cols // 4, 4)
|
||||
swizzled_scale = padded_scale.permute((0, 1, 4, 3, 2, 5))
|
||||
swizzled_scale = swizzled_scale.contiguous().cuda()
|
||||
return (swizzled_scale.reshape(M, K)
|
||||
if scale_ndim == 2 else swizzled_scale.reshape(B, M, K))
|
||||
|
||||
def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
|
||||
# GEMM 1
|
||||
# The FlashInfer Cutlass fused MoE kernel expects the combined weights
|
||||
@ -1128,8 +1073,7 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
|
||||
"Expected weight_scale.dim(1) to be divisible by 16")
|
||||
assert (layer.w13_weight_scale.dtype == torch.float8_e4m3fn), (
|
||||
"Weight Blockscale must be represented as FP8-E4M3")
|
||||
w13_blockscale_swizzled = self.swizzle_blockscale(
|
||||
layer.w13_weight_scale)
|
||||
w13_blockscale_swizzled = swizzle_blockscale(layer.w13_weight_scale)
|
||||
|
||||
layer.w13_blockscale_swizzled = Parameter(w13_blockscale_swizzled,
|
||||
requires_grad=False)
|
||||
@ -1151,7 +1095,7 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
|
||||
"Expected weight_scale.dim(1) to be divisible by 16")
|
||||
assert (layer.w2_weight_scale.dtype == torch.float8_e4m3fn), (
|
||||
"Weight Blockscale must be represented as FP8-E4M3")
|
||||
w2_blockscale_swizzled = self.swizzle_blockscale(layer.w2_weight_scale)
|
||||
w2_blockscale_swizzled = swizzle_blockscale(layer.w2_weight_scale)
|
||||
|
||||
layer.w2_blockscale_swizzled = Parameter(w2_blockscale_swizzled,
|
||||
requires_grad=False)
|
||||
|
||||
@ -3,18 +3,19 @@
|
||||
# Copyright © 2025, Oracle and/or its affiliates.
|
||||
|
||||
import os
|
||||
from typing import Any, Optional
|
||||
from typing import Any, Callable, Optional
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from torch.nn.parameter import Parameter
|
||||
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.layers.fused_moe import FusedMoE, FusedMoEMethodBase
|
||||
from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
|
||||
set_weight_attrs)
|
||||
from vllm.model_executor.layers.quantization import QuantizationMethods
|
||||
from vllm.model_executor.layers.quantization.base_config import (
|
||||
QuantizationConfig)
|
||||
QuantizationConfig, QuantizeMethodBase)
|
||||
|
||||
logger = init_logger(__name__)
|
||||
"""By default, use 8 bit as target precision, but it can be
|
||||
@ -71,9 +72,11 @@ class RTNConfig(QuantizationConfig):
|
||||
return cls(weight_bits, group_size)
|
||||
|
||||
def get_quant_method(self, layer: torch.nn.Module,
|
||||
prefix: str) -> Optional["RTNLinearMethod"]:
|
||||
prefix: str) -> Optional["QuantizeMethodBase"]:
|
||||
if isinstance(layer, LinearBase):
|
||||
return RTNLinearMethod(self)
|
||||
elif isinstance(layer, FusedMoE):
|
||||
return RTNMoEMethod(self)
|
||||
return None
|
||||
|
||||
|
||||
@ -94,11 +97,18 @@ class RTNTensor:
|
||||
self.data.narrow(dim, start // factor, length // factor),
|
||||
self.scale.narrow(dim, start, length), self.quant_config)
|
||||
|
||||
def __getitem__(self, key):
|
||||
return RTNTensor(self.data[key], self.scale[key], self.quant_config)
|
||||
|
||||
@property
|
||||
def shape(self):
|
||||
shape = self.data.shape
|
||||
factor = 1 if self.quant_config.weight_bits == 8 else 2
|
||||
return torch.Size((shape[0] * factor, shape[1]))
|
||||
batch_present = len(shape) == 3
|
||||
if batch_present:
|
||||
return torch.Size((shape[0], shape[1] * factor, shape[2]))
|
||||
else:
|
||||
return torch.Size((shape[0] * factor, shape[1]))
|
||||
|
||||
def copy_(self, loaded_weight: torch.Tensor) -> None:
|
||||
qweight, weight_scale = rtn_quantize(loaded_weight.cuda(),
|
||||
@ -165,7 +175,7 @@ class RTNLinearMethod(LinearMethodBase):
|
||||
weight = RTNParameter(data=torch.empty(output_size_per_partition //
|
||||
factor,
|
||||
input_size_per_partition,
|
||||
dtype=torch.int8),
|
||||
dtype=torch.uint8),
|
||||
scale=scale,
|
||||
quant_config=self.quant_config)
|
||||
|
||||
@ -180,18 +190,7 @@ class RTNLinearMethod(LinearMethodBase):
|
||||
layer.output_size_per_partition = output_size_per_partition
|
||||
|
||||
def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
|
||||
"""torch.compile does not know how to deal with a Parameter subclass
|
||||
(aka RTNParameter). As we don't really need RTNParameters for the
|
||||
forward pass, we replace them with equivalent instances of Parameters.
|
||||
"""
|
||||
old_weight = layer.weight
|
||||
assert isinstance(old_weight, RTNParameter)
|
||||
data = old_weight.data.data
|
||||
|
||||
delattr(layer, "weight")
|
||||
|
||||
new_weight = Parameter(data=data, requires_grad=False)
|
||||
layer.register_parameter("weight", new_weight)
|
||||
fix_weights(layer, "weight")
|
||||
|
||||
def apply(self,
|
||||
layer: torch.nn.Module,
|
||||
@ -209,6 +208,128 @@ class RTNLinearMethod(LinearMethodBase):
|
||||
return out
|
||||
|
||||
|
||||
class RTNMoEMethod(FusedMoEMethodBase):
|
||||
|
||||
def __init__(self, quant_config: RTNConfig):
|
||||
self.quant_config = quant_config
|
||||
|
||||
def create_weights(self, layer: torch.nn.Module, num_experts: int,
|
||||
hidden_size: int, intermediate_size_per_partition: int,
|
||||
params_dtype: torch.dtype, **extra_weight_attrs):
|
||||
|
||||
factor = 1 if self.quant_config.weight_bits == 8 else 2
|
||||
|
||||
# Fused gate_up_proj (column parallel)
|
||||
num_groups_per_col = (hidden_size // self.quant_config.group_size
|
||||
if self.quant_config.group_size != -1 else 1)
|
||||
w13_scale = Parameter(
|
||||
torch.empty(num_experts,
|
||||
2 * intermediate_size_per_partition,
|
||||
num_groups_per_col,
|
||||
dtype=params_dtype),
|
||||
requires_grad=False,
|
||||
)
|
||||
layer.register_parameter("w13_scale", w13_scale)
|
||||
|
||||
w13_weight = RTNParameter(data=torch.empty(
|
||||
num_experts,
|
||||
2 * intermediate_size_per_partition // factor,
|
||||
hidden_size,
|
||||
dtype=torch.uint8),
|
||||
scale=w13_scale,
|
||||
quant_config=self.quant_config)
|
||||
layer.register_parameter("w13_weight", w13_weight)
|
||||
set_weight_attrs(w13_weight, extra_weight_attrs)
|
||||
|
||||
# down_proj (row parallel)
|
||||
num_groups_per_col = (intermediate_size_per_partition //
|
||||
self.quant_config.group_size
|
||||
if self.quant_config.group_size != -1 else 1)
|
||||
w2_scale = Parameter(torch.zeros(num_experts,
|
||||
hidden_size,
|
||||
num_groups_per_col,
|
||||
dtype=params_dtype),
|
||||
requires_grad=False)
|
||||
layer.register_parameter("w2_scale", w2_scale)
|
||||
|
||||
w2_weight = RTNParameter(data=torch.empty(
|
||||
num_experts,
|
||||
hidden_size // factor,
|
||||
intermediate_size_per_partition,
|
||||
dtype=torch.uint8),
|
||||
scale=w2_scale,
|
||||
quant_config=self.quant_config)
|
||||
layer.register_parameter("w2_weight", w2_weight)
|
||||
set_weight_attrs(w2_weight, extra_weight_attrs)
|
||||
|
||||
def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
|
||||
weight_bits = self.quant_config.weight_bits
|
||||
fix_weights(layer, "w13_weight", weight_bits == 4)
|
||||
fix_weights(layer, "w2_weight", weight_bits == 4)
|
||||
|
||||
def apply(
|
||||
self,
|
||||
layer: torch.nn.Module,
|
||||
x: torch.Tensor,
|
||||
router_logits: torch.Tensor,
|
||||
top_k: int,
|
||||
renormalize: bool,
|
||||
use_grouped_topk: bool = False,
|
||||
topk_group: Optional[int] = None,
|
||||
num_expert_group: Optional[int] = None,
|
||||
global_num_experts: int = -1,
|
||||
expert_map: Optional[torch.Tensor] = None,
|
||||
custom_routing_function: Optional[Callable] = None,
|
||||
scoring_func: str = "softmax",
|
||||
e_score_correction_bias: Optional[torch.Tensor] = None,
|
||||
apply_router_weight_on_input: bool = False,
|
||||
activation: str = "silu",
|
||||
enable_eplb: bool = False,
|
||||
expert_load_view: Optional[torch.Tensor] = None,
|
||||
logical_to_physical_map: Optional[torch.Tensor] = None,
|
||||
logical_replica_count: Optional[torch.Tensor] = None,
|
||||
) -> torch.Tensor:
|
||||
if enable_eplb:
|
||||
raise NotImplementedError(
|
||||
"EPLB not supported for `RTNMoEMethod` yet.")
|
||||
|
||||
from vllm.model_executor.layers.fused_moe import fused_experts
|
||||
|
||||
topk_weights, topk_ids = FusedMoE.select_experts(
|
||||
hidden_states=x,
|
||||
router_logits=router_logits,
|
||||
use_grouped_topk=use_grouped_topk,
|
||||
top_k=top_k,
|
||||
renormalize=renormalize,
|
||||
topk_group=topk_group,
|
||||
num_expert_group=num_expert_group,
|
||||
custom_routing_function=custom_routing_function,
|
||||
scoring_func=scoring_func,
|
||||
e_score_correction_bias=e_score_correction_bias)
|
||||
|
||||
weight_bits = self.quant_config.weight_bits
|
||||
group_size = self.quant_config.group_size
|
||||
|
||||
ret = fused_experts(
|
||||
x,
|
||||
layer.w13_weight,
|
||||
layer.w2_weight,
|
||||
topk_weights=topk_weights,
|
||||
topk_ids=topk_ids,
|
||||
inplace=True,
|
||||
activation=activation,
|
||||
use_int4_w4a16=weight_bits == 4,
|
||||
use_int8_w8a16=weight_bits == 8,
|
||||
global_num_experts=global_num_experts,
|
||||
w1_scale=layer.w13_scale,
|
||||
w2_scale=layer.w2_scale,
|
||||
apply_router_weight_on_input=apply_router_weight_on_input,
|
||||
expert_map=expert_map,
|
||||
block_shape=[0, group_size])
|
||||
|
||||
return ret
|
||||
|
||||
|
||||
def rtn_quantize(tensor: torch.Tensor, num_bits: int,
|
||||
group_size: int) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
"""Quantize a tensor using per-group static scaling factor.
|
||||
@ -221,34 +342,44 @@ def rtn_quantize(tensor: torch.Tensor, num_bits: int,
|
||||
If equal to -1, each row in the input tensor is treated
|
||||
as one group.
|
||||
"""
|
||||
batch_present = len(tensor.shape) == 3
|
||||
if not batch_present:
|
||||
tensor = tensor.unsqueeze(0)
|
||||
|
||||
q_range = 2**num_bits
|
||||
num_groups = (tensor.shape[0] * tensor.shape[1] //
|
||||
group_size if group_size != -1 else tensor.shape[0])
|
||||
num_groups = (tensor.shape[1] * tensor.shape[2] //
|
||||
group_size if group_size != -1 else tensor.shape[1])
|
||||
"""Calculate a scaling factor per input group.
|
||||
"""
|
||||
input_flat = tensor.reshape(num_groups, -1)
|
||||
input_min = torch.min(input_flat, dim=1, keepdim=True)[0]
|
||||
input_max = torch.max(input_flat, dim=1, keepdim=True)[0]
|
||||
input_flat = tensor.reshape(tensor.shape[0], num_groups, -1)
|
||||
input_min = torch.min(input_flat, dim=2, keepdim=True)[0]
|
||||
input_max = torch.max(input_flat, dim=2, keepdim=True)[0]
|
||||
input_max_abs = torch.max(input_min.abs(), input_max.abs())
|
||||
scale = (input_max_abs * 2.0 / (q_range - 1))
|
||||
"""Scale each input group, truncate and round to the nearest integer.
|
||||
"""Scale each input group, round to the nearest integer, shift
|
||||
the range and truncate.
|
||||
"""
|
||||
scaled_input = input_flat / scale
|
||||
scaled_input = scaled_input.clamp(-q_range // 2, q_range // 2 - 1)
|
||||
scaled_input = scaled_input.round()
|
||||
scaled_input += q_range // 2
|
||||
scaled_input = scaled_input.clamp(0, q_range - 1)
|
||||
|
||||
scale = scale.reshape(tensor.shape[0], -1).contiguous()
|
||||
inputs_q = scaled_input.reshape(tensor.shape).to(torch.int8)
|
||||
scale = scale.reshape(tensor.shape[0], tensor.shape[1], -1).contiguous()
|
||||
inputs_q = scaled_input.reshape(tensor.shape).to(torch.uint8)
|
||||
inputs_q = inputs_q.contiguous()
|
||||
|
||||
if num_bits == 4:
|
||||
"""Pack two 4-bit values into each byte.
|
||||
"""
|
||||
inputs_q = (inputs_q[:, 1::2] << 4) | (inputs_q[:, ::2] & 0xf)
|
||||
inputs_q = inputs_q.reshape(tensor.shape[0] // 2, tensor.shape[1])
|
||||
inputs_q = (inputs_q[:, :, 1::2] << 4) | (inputs_q[:, :, ::2] & 0xf)
|
||||
inputs_q = inputs_q.reshape(tensor.shape[0], tensor.shape[1] // 2,
|
||||
tensor.shape[2])
|
||||
inputs_q = inputs_q.contiguous()
|
||||
|
||||
if not batch_present:
|
||||
inputs_q = inputs_q.squeeze(0)
|
||||
scale = scale.squeeze(0)
|
||||
|
||||
return inputs_q, scale
|
||||
|
||||
|
||||
@ -259,31 +390,60 @@ def rtn_dequantize(tensor: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
|
||||
tensor: The input tensor.
|
||||
scale: The tensor with per-group scale factors.
|
||||
"""
|
||||
batch_present = len(tensor.shape) == 3
|
||||
if not batch_present:
|
||||
tensor = tensor.unsqueeze(0)
|
||||
scale = scale.unsqueeze(0)
|
||||
|
||||
num_groups = scale.size(0) * scale.size(1)
|
||||
input_dim, output_dim = tensor.shape
|
||||
num_groups = scale.size(1) * scale.size(2)
|
||||
batch, input_dim, output_dim = tensor.shape
|
||||
|
||||
num_bits = 8 if input_dim == scale.size(0) else 4
|
||||
num_bits = 8 if input_dim == scale.size(1) else 4
|
||||
q_range = 2**num_bits
|
||||
if num_bits == 4:
|
||||
input_dim *= 2
|
||||
|
||||
data = torch.empty((input_dim, output_dim),
|
||||
data = torch.empty((batch, input_dim, output_dim),
|
||||
dtype=scale.dtype,
|
||||
device=tensor.device)
|
||||
|
||||
if num_bits == 8:
|
||||
data.copy_(tensor)
|
||||
data -= q_range // 2
|
||||
else:
|
||||
"""Unpack two 4-bit values from each byte.
|
||||
"""
|
||||
tensor = tensor.reshape(input_dim, output_dim // 2)
|
||||
tensor = tensor.reshape(batch, input_dim, output_dim // 2)
|
||||
for i in range(2):
|
||||
data[:, i::2] = (tensor << 4 * (1 - i)) >> 4
|
||||
data[:, :, i::2] = ((tensor << 4 *
|
||||
(1 - i)) >> 4).to(torch.int8) - q_range // 2
|
||||
"""Scale each input group with its scaling factor.
|
||||
"""
|
||||
scale = scale.reshape(num_groups, -1)
|
||||
data = data.reshape(num_groups, -1)
|
||||
scale = scale.reshape(batch, num_groups, -1)
|
||||
data = data.reshape(batch, num_groups, -1)
|
||||
data = torch.mul(data, scale)
|
||||
|
||||
input_deq = data.reshape((input_dim, output_dim)).contiguous()
|
||||
input_deq = data.reshape((batch, input_dim, output_dim)).contiguous()
|
||||
if not batch_present:
|
||||
input_deq = input_deq.squeeze(0)
|
||||
|
||||
return input_deq
|
||||
|
||||
|
||||
def fix_weights(layer: torch.nn.Module,
|
||||
param_name: str,
|
||||
reshape: bool = False):
|
||||
"""torch.compile does not know how to deal with a Parameter subclass
|
||||
(aka RTNParameter). As we don't really need RTNParameters for the
|
||||
forward pass, we replace them with equivalent instances of Parameters.
|
||||
"""
|
||||
old_weight = getattr(layer, param_name)
|
||||
assert isinstance(old_weight, RTNParameter)
|
||||
data = old_weight.data.data
|
||||
|
||||
delattr(layer, param_name)
|
||||
|
||||
if reshape:
|
||||
data = data.reshape(old_weight.shape[0], old_weight.shape[1] * 2, -1)
|
||||
new_weight = Parameter(data=data, requires_grad=False)
|
||||
layer.register_parameter(param_name, new_weight)
|
||||
|
||||
@ -238,13 +238,20 @@ def per_token_group_quant_int8(
|
||||
int8_min = iinfo.min
|
||||
|
||||
x_q = torch.empty_like(x, device=x.device, dtype=dtype)
|
||||
M = x.numel() // group_size
|
||||
N = group_size
|
||||
x_s = torch.empty(
|
||||
x.shape[:-1] + (x.shape[-1] // group_size, ),
|
||||
device=x.device,
|
||||
dtype=torch.float32,
|
||||
)
|
||||
# prefer CUDA kernel if available
|
||||
if current_platform.is_cuda():
|
||||
torch.ops._C.per_token_group_quant_int8(x, x_q, x_s, group_size, eps,
|
||||
float(int8_min),
|
||||
float(int8_max))
|
||||
return x_q, x_s
|
||||
|
||||
M = x.numel() // group_size
|
||||
N = group_size
|
||||
|
||||
BLOCK = triton.next_power_of_2(N)
|
||||
# heuristics for number of warps
|
||||
|
||||
@ -2,13 +2,12 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import torch
|
||||
|
||||
from vllm._custom_ops import cutlass_scaled_mm_supports_fp4
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.scalar_type import scalar_types
|
||||
|
||||
__all__ = [
|
||||
"break_fp4_bytes", "dequantize_to_dtype", "ref_nvfp4_quant",
|
||||
"cutlass_fp4_supported"
|
||||
"break_fp4_bytes",
|
||||
"dequantize_to_dtype",
|
||||
"ref_nvfp4_quant",
|
||||
]
|
||||
|
||||
FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max()
|
||||
@ -17,14 +16,6 @@ kE2M1ToFloat = torch.tensor([0., 0.5, 1., 1.5, 2., 3., 4., 6.],
|
||||
dtype=torch.float32)
|
||||
|
||||
|
||||
def cutlass_fp4_supported() -> bool:
|
||||
if not current_platform.is_cuda():
|
||||
return False
|
||||
capability_tuple = current_platform.get_device_capability()
|
||||
capability = -1 if capability_tuple is None else capability_tuple.to_int()
|
||||
return cutlass_scaled_mm_supports_fp4(capability)
|
||||
|
||||
|
||||
def break_fp4_bytes(a, dtype):
|
||||
assert a.dtype == torch.uint8
|
||||
m, n = a.shape
|
||||
|
||||
@ -8,8 +8,10 @@ from typing import ClassVar, NamedTuple, Optional
|
||||
import numpy
|
||||
import torch
|
||||
|
||||
from vllm._custom_ops import cutlass_scaled_mm_supports_fp4
|
||||
from vllm.model_executor.layers.quantization.qqq import (
|
||||
MARLIN_QQQ_SUPPORTED_NUM_BITS)
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.scalar_type import ScalarType, scalar_types
|
||||
|
||||
|
||||
@ -592,3 +594,56 @@ def awq_pack(
|
||||
q_w = q_w.reshape((-1, size_n)).contiguous()
|
||||
|
||||
return pack_cols(q_w, num_bits, size_k, size_n)
|
||||
|
||||
|
||||
def swizzle_blockscale(scale: torch.Tensor) -> torch.Tensor:
|
||||
"""
|
||||
Pad and block-interleave the FP4 block-scales so that they match the data
|
||||
layout expected by the CUTLASS / FlashInfer kernels.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
scale: torch.Tensor
|
||||
|
||||
Returns
|
||||
-------
|
||||
torch.Tensor
|
||||
The swizzled tensor with the same logical shape as *scale*.
|
||||
"""
|
||||
assert scale.dtype == torch.float8_e4m3fn, (
|
||||
"swizzle_blockscale expects the input tensor to be in "
|
||||
"torch.float8_e4m3fn format.")
|
||||
|
||||
scale_ndim = scale.ndim
|
||||
if scale_ndim == 2:
|
||||
scale = scale.unsqueeze(0) # (1, M, K)
|
||||
assert scale.ndim == 3, "Expected a 2-D or 3-D tensor for block scales."
|
||||
|
||||
B, M, K = scale.shape
|
||||
|
||||
def _round_up(x: int, m: int) -> int:
|
||||
return (x + m - 1) // m * m
|
||||
|
||||
M_padded = _round_up(M, 128)
|
||||
K_padded = _round_up(K, 4)
|
||||
|
||||
padded = torch.zeros((B, M_padded, K_padded),
|
||||
dtype=scale.dtype,
|
||||
device=scale.device)
|
||||
padded[:B, :M, :K] = scale
|
||||
|
||||
# Reshape / permute to the layout required by the kernel.
|
||||
padded = padded.reshape(B, M_padded // 128, 4, 32, K_padded // 4, 4)
|
||||
swizzled = padded.permute(0, 1, 4, 3, 2, 5).contiguous().cuda()
|
||||
|
||||
if scale_ndim == 2:
|
||||
return swizzled.reshape(M, K)
|
||||
return swizzled.reshape(B, M, K)
|
||||
|
||||
|
||||
def cutlass_fp4_supported() -> bool:
|
||||
if not current_platform.is_cuda():
|
||||
return False
|
||||
capability_tuple = current_platform.get_device_capability()
|
||||
capability = -1 if capability_tuple is None else capability_tuple.to_int()
|
||||
return cutlass_scaled_mm_supports_fp4(capability)
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from collections.abc import Iterable, Mapping, Sequence
|
||||
from typing import Optional, TypedDict, Union
|
||||
from typing import Annotated, Optional, Union
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
@ -29,6 +29,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
||||
PromptUpdate)
|
||||
from vllm.multimodal.profiling import BaseDummyInputsBuilder
|
||||
from vllm.sequence import IntermediateTensors
|
||||
from vllm.utils.tensor_schema import TensorSchema, TensorShape
|
||||
|
||||
# yapf: disable
|
||||
from .idefics2_vision_model import Idefics2VisionConfig
|
||||
@ -42,15 +43,26 @@ from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
|
||||
merge_multimodal_embeddings)
|
||||
|
||||
|
||||
class AriaImagePixelInputs(TypedDict):
|
||||
pixel_values: torch.Tensor
|
||||
pixel_mask: Optional[torch.Tensor]
|
||||
class AriaImagePixelInputs(TensorSchema):
|
||||
"""
|
||||
Shape:
|
||||
pixel_values: `(batch_size * num_images, num_channels, height, width)`
|
||||
pixel_mask: `(batch_size * num_images, height, width)`
|
||||
Dimensions:
|
||||
- b: Batch size
|
||||
- n: Number of images
|
||||
- c: Number of channels
|
||||
- h: Height of each image
|
||||
- w: Width of each image
|
||||
"""
|
||||
|
||||
pixel_values: Annotated[
|
||||
torch.Tensor,
|
||||
TensorShape("bn", 3, "h", "w"),
|
||||
]
|
||||
|
||||
pixel_mask: Annotated[
|
||||
Optional[torch.Tensor],
|
||||
TensorShape("bn", "h", "w"),
|
||||
]
|
||||
|
||||
|
||||
class AriaVisionTransformer(Idefics3VisionTransformer, SupportsQuant):
|
||||
packed_modules_mapping = {"qkv_proj": ["q_proj", "k_proj", "v_proj"]}
|
||||
@ -540,12 +552,6 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal):
|
||||
self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
|
||||
self.vocab_size, logit_scale)
|
||||
|
||||
def _validate_image_sizes(
|
||||
self, images: list[torch.Tensor]) -> list[torch.Tensor]:
|
||||
if not all(img.shape == images[0].shape for img in images):
|
||||
raise ValueError("All images must be the same size")
|
||||
return images
|
||||
|
||||
def _parse_and_validate_image_input(
|
||||
self, **kwargs: object) -> Optional[AriaImagePixelInputs]:
|
||||
pixel_values = kwargs.pop("pixel_values", None)
|
||||
@ -554,23 +560,9 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal):
|
||||
if pixel_values is None:
|
||||
return None
|
||||
|
||||
if not isinstance(pixel_values, (torch.Tensor, list)):
|
||||
raise ValueError("Incorrect type of pixel values. "
|
||||
f"Got type: {type(pixel_values)}")
|
||||
|
||||
pixel_values = self._validate_image_sizes(pixel_values)
|
||||
pixel_values = flatten_bn(pixel_values, concat=True)
|
||||
|
||||
if pixel_mask is not None:
|
||||
if not isinstance(pixel_mask, (torch.Tensor, list)):
|
||||
raise ValueError("Incorrect type of pixel mask. "
|
||||
f"Got type: {type(pixel_mask)}")
|
||||
|
||||
pixel_mask = flatten_bn(pixel_mask, concat=True)
|
||||
|
||||
return AriaImagePixelInputs(
|
||||
pixel_values=pixel_values,
|
||||
pixel_mask=pixel_mask,
|
||||
pixel_values=flatten_bn(pixel_values, concat=True),
|
||||
pixel_mask=flatten_bn(pixel_mask, concat=True),
|
||||
)
|
||||
|
||||
def _create_patch_attention_mask(
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user