diff --git a/.buildkite/nightly-benchmarks/README.md b/.buildkite/nightly-benchmarks/README.md index cdf6a645147e5..ae42f70077cec 100644 --- a/.buildkite/nightly-benchmarks/README.md +++ b/.buildkite/nightly-benchmarks/README.md @@ -74,7 +74,7 @@ Here is an example of one test inside `latency-tests.json`: In this example: - The `test_name` attributes is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`. -- The `parameters` attribute control the command line arguments to be used for `benchmark_latency.py`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underline to dash when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15` +- The `parameters` attribute control the command line arguments to be used for `vllm bench latency`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underline to dash when feeding the arguments to `vllm bench latency`. For example, the corresponding command line arguments for `vllm bench latency` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15` Note that the performance numbers are highly sensitive to the value of the parameters. Please make sure the parameters are set correctly. @@ -82,13 +82,13 @@ WARNING: The benchmarking script will save json results by itself, so please do ### Throughput test -The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except for that the parameters will be fed forward to `benchmark_throughput.py`. 
+The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except for that the parameters will be fed forward to `vllm bench throughput`. The number of this test is also stable -- a slight change on the value of this number might vary the performance numbers by a lot. ### Serving test -We test the throughput by using `benchmark_serving.py` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example: +We test the throughput by using `vllm bench serve` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example: ```json [ @@ -118,8 +118,8 @@ Inside this example: - The `test_name` attribute is also a unique identifier for the test. It must start with `serving_`. - The `server-parameters` includes the command line arguments for vLLM server. -- The `client-parameters` includes the command line arguments for `benchmark_serving.py`. -- The `qps_list` controls the list of qps for test. It will be used to configure the `--request-rate` parameter in `benchmark_serving.py` +- The `client-parameters` includes the command line arguments for `vllm bench serve`. +- The `qps_list` controls the list of qps for test. It will be used to configure the `--request-rate` parameter in `vllm bench serve` The number of this test is less stable compared to the delay and latency benchmarks (due to randomized sharegpt dataset sampling inside `benchmark_serving.py`), but a large change on this number (e.g. 5% change) still vary the output greatly. 
diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py index 724b53056ca8f..05623879c0c2c 100644 --- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py +++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py @@ -100,7 +100,7 @@ if __name__ == "__main__": raw_result = json.loads(f.read()) if "serving" in str(test_file): - # this result is generated via `benchmark_serving.py` + # this result is generated via `vllm bench serve` command # attach the benchmarking command to raw_result try: @@ -120,7 +120,7 @@ if __name__ == "__main__": continue elif "latency" in f.name: - # this result is generated via `benchmark_latency.py` + # this result is generated via `vllm bench latency` command # attach the benchmarking command to raw_result try: @@ -148,7 +148,7 @@ if __name__ == "__main__": continue elif "throughput" in f.name: - # this result is generated via `benchmark_throughput.py` + # this result is generated via `vllm bench throughput` command # attach the benchmarking command to raw_result try: diff --git a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh index 4d01a314adc47..06d7b5ed484da 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh @@ -73,7 +73,7 @@ get_current_llm_serving_engine() { echo "Container: vllm" # move to a completely irrelevant directory, to avoid import vllm from current folder export CURRENT_LLM_SERVING_ENGINE=vllm - + return fi } @@ -95,12 +95,14 @@ json2args() { } kill_gpu_processes() { - pkill -f python - pkill -f python3 - pkill -f tritonserver - pkill -f pt_main_thread - pkill -f text-generation - pkill -f lmdeploy + pkill -f '[p]ython' + pkill -f '[p]ython3' + pkill -f '[t]ritonserver' + pkill -f '[p]t_main_thread' 
+ pkill -f '[t]ext-generation' + pkill -f '[l]mdeploy' + # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445 + pkill -f '[V]LLM' while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do sleep 1 @@ -125,7 +127,7 @@ ensure_installed() { } run_serving_tests() { - # run serving tests using `benchmark_serving.py` + # run serving tests using `vllm bench serve` command # $1: a json file specifying serving test cases local serving_test_file @@ -225,7 +227,7 @@ run_serving_tests() { if [[ "$dataset_name" = "sharegpt" ]]; then - client_command="python3 benchmark_serving.py \ + client_command="vllm bench serve \ --backend $backend \ --tokenizer /tokenizer_cache \ --model $model \ @@ -246,7 +248,7 @@ run_serving_tests() { sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len') sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len') - client_command="python3 benchmark_serving.py \ + client_command="vllm bench serve \ --backend $backend \ --tokenizer /tokenizer_cache \ --model $model \ @@ -265,13 +267,13 @@ run_serving_tests() { $client_args" else - + echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name." exit 1 fi - + echo "Running test case $test_name with qps $qps" echo "Client command: $client_command" @@ -302,7 +304,7 @@ run_serving_tests() { } run_genai_perf_tests() { - # run genai-perf tests + # run genai-perf tests # $1: a json file specifying genai-perf test cases local genai_perf_test_file @@ -311,14 +313,14 @@ run_genai_perf_tests() { # Iterate over genai-perf tests jq -c '.[]' "$genai_perf_test_file" | while read -r params; do # get the test name, and append the GPU type back to it. - test_name=$(echo "$params" | jq -r '.test_name') - + test_name=$(echo "$params" | jq -r '.test_name') + # if TEST_SELECTOR is set, only run the test cases that match the selector if [[ -n "$TEST_SELECTOR" ]] && [[ ! 
"$test_name" =~ $TEST_SELECTOR ]]; then echo "Skip test case $test_name." continue fi - + # prepend the current serving engine to the test name test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name} @@ -369,10 +371,10 @@ run_genai_perf_tests() { qps=$num_prompts echo "now qps is $qps" fi - + new_test_name=$test_name"_qps_"$qps backend=$CURRENT_LLM_SERVING_ENGINE - + if [[ "$backend" == *"vllm"* ]]; then backend="vllm" fi @@ -413,7 +415,7 @@ prepare_dataset() { do cat sonnet.txt >> sonnet_4x.txt done - + } main() { diff --git a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh index f05040618981c..b515ee43934d1 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh @@ -126,7 +126,8 @@ kill_gpu_processes() { ps -aux lsof -t -i:8000 | xargs -r kill -9 pgrep python3 | xargs -r kill -9 - + # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445 + pgrep VLLM | xargs -r kill -9 # wait until GPU memory usage smaller than 1GB if command -v nvidia-smi; then @@ -164,7 +165,7 @@ upload_to_buildkite() { } run_latency_tests() { - # run latency tests using `benchmark_latency.py` + # run latency tests using `vllm bench latency` command # $1: a json file specifying latency test cases local latency_test_file @@ -205,7 +206,7 @@ run_latency_tests() { fi fi - latency_command=" $latency_envs python3 benchmark_latency.py \ + latency_command=" $latency_envs vllm bench latency \ --output-json $RESULTS_FOLDER/${test_name}.json \ $latency_args" @@ -231,7 +232,7 @@ run_latency_tests() { } run_throughput_tests() { - # run throughput tests using `benchmark_throughput.py` + # run throughput tests using `vllm bench throughput` # $1: a json file specifying throughput test cases local throughput_test_file @@ -272,7 +273,7 @@ run_throughput_tests() { fi fi - 
throughput_command=" $throughput_envs python3 benchmark_throughput.py \ + throughput_command=" $throughput_envs vllm bench throughput \ --output-json $RESULTS_FOLDER/${test_name}.json \ $throughput_args" @@ -297,7 +298,7 @@ run_throughput_tests() { } run_serving_tests() { - # run serving tests using `benchmark_serving.py` + # run serving tests using `vllm bench serve` command # $1: a json file specifying serving test cases local serving_test_file @@ -393,7 +394,7 @@ run_serving_tests() { # pass the tensor parallel size to the client so that it can be displayed # on the benchmark dashboard - client_command="python3 benchmark_serving.py \ + client_command="vllm bench serve \ --save-result \ --result-dir $RESULTS_FOLDER \ --result-filename ${new_test_name}.json \ @@ -447,7 +448,7 @@ main() { (which jq) || (apt-get update && apt-get -y install jq) (which lsof) || (apt-get update && apt-get install -y lsof) - # get the current IP address, required by benchmark_serving.py + # get the current IP address, required by `vllm bench serve` command export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') # turn of the reporting of the status of each request, to clean up the terminal output export VLLM_LOGGING_LEVEL="WARNING" diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh index 90cc9c8446223..7c7dbb461ce0d 100644 --- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh @@ -13,9 +13,9 @@ NUMA_NODE=${NUMA_NODE:-1} export CMAKE_BUILD_PARALLEL_LEVEL=32 # Setup cleanup -remove_docker_container() { - set -e; - docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true; +remove_docker_container() { + set -e; + docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true; } trap remove_docker_container EXIT remove_docker_container @@ -69,7 +69,7 @@ function cpu_tests() { docker exec cpu-test-"$NUMA_NODE" bash -c " set -e pytest -s -v \ - 
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]" + tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]" # Note: disable it until supports V1 # Run AWQ test @@ -83,7 +83,7 @@ function cpu_tests() { set -e VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 & timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1 - python3 benchmarks/benchmark_serving.py \ + vllm bench serve \ --backend vllm \ --dataset-name random \ --model meta-llama/Llama-3.2-3B-Instruct \ diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh new file mode 100755 index 0000000000000..d998c1f73b514 --- /dev/null +++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh @@ -0,0 +1,166 @@ +#!/bin/bash + +set -xu + + +remove_docker_container() { + docker rm -f tpu-test || true; + docker rm -f vllm-tpu || true; +} + +trap remove_docker_container EXIT + +# Remove the container that might not be cleaned up in the previous run. +remove_docker_container + +# Build the docker image. +docker build -f docker/Dockerfile.tpu -t vllm-tpu . + +# Set up cleanup. +cleanup_docker() { + # Get Docker's root directory + docker_root=$(docker info -f '{{.DockerRootDir}}') + if [ -z "$docker_root" ]; then + echo "Failed to determine Docker root directory." + exit 1 + fi + echo "Docker root directory: $docker_root" + # Check disk usage of the filesystem where Docker's root directory is located + disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//') + # Define the threshold + threshold=70 + if [ "$disk_usage" -gt "$threshold" ]; then + echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..." 
+ # Remove dangling images (those that are not tagged and not used by any container) + docker image prune -f + # Remove unused volumes / force the system prune for old images as well. + docker volume prune -f && docker system prune --force --filter "until=72h" --all + echo "Docker images and volumes cleanup completed." + else + echo "Disk usage is below $threshold%. No cleanup needed." + fi +} +cleanup_docker + +# For HF_TOKEN. +source /etc/environment + +docker run --privileged --net host --shm-size=16G -it \ + -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \ + vllm-tpu /bin/bash -c ' +set -e # Exit immediately if a command exits with a non-zero status. +set -u # Treat unset variables as an error. + +echo "--- Starting script inside Docker container ---" + +# Create results directory +RESULTS_DIR=$(mktemp -d) +# If mktemp fails, set -e will cause the script to exit. +echo "Results will be stored in: $RESULTS_DIR" + +# Install dependencies +echo "--- Installing Python dependencies ---" +python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \ + && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \ + && python3 -m pip install --progress-bar off lm_eval[api]==0.4.4 \ + && python3 -m pip install --progress-bar off hf-transfer +echo "--- Python dependencies installed ---" +export VLLM_USE_V1=1 +export VLLM_XLA_CHECK_RECOMPILATION=1 +export VLLM_XLA_CACHE_PATH= +echo "Using VLLM V1" + +echo "--- Hardware Information ---" +# tpu-info +echo "--- Starting Tests ---" +set +e +overall_script_exit_code=0 + +# --- Test Definitions --- +# If a test fails, this function will print logs and will not cause the main script to exit. +run_test() { + local test_num=$1 + local test_name=$2 + local test_command=$3 + local log_file="$RESULTS_DIR/test_${test_num}.log" + local actual_exit_code + + echo "--- TEST_$test_num: Running $test_name ---" + + # Execute the test command. 
+ eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2) + actual_exit_code=$? + + echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" # This goes to main log + echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file" # Also to per-test log + + if [ "$actual_exit_code" -ne 0 ]; then + echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2 + echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2 + if [ -f "$log_file" ]; then + cat "$log_file" >&2 + else + echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2 + fi + echo "--- End of log for TEST_$test_num ($test_name) ---" >&2 + return "$actual_exit_code" # Return the failure code + else + echo "TEST_$test_num ($test_name) PASSED." + return 0 # Return success + fi +} + +# Helper function to call run_test and update the overall script exit code +run_and_track_test() { + local test_num_arg="$1" + local test_name_arg="$2" + local test_command_arg="$3" + + # Run the test + run_test "$test_num_arg" "$test_name_arg" "$test_command_arg" + local test_specific_exit_code=$? + + # If the test failed, set the overall script exit code to 1 + if [ "$test_specific_exit_code" -ne 0 ]; then + # No need for extra echo here, run_test already logged the failure. 
+ overall_script_exit_code=1 + fi +} + +# --- Actual Test Execution --- +run_and_track_test 1 "test_struct_output_generate.py" \ + "HF_HUB_DISABLE_XET=1 python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\"" +run_and_track_test 2 "test_moe_pallas.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py" +run_and_track_test 3 "test_lora.py" \ + "VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py" +run_and_track_test 4 "test_tpu_qkv_linear.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py" +run_and_track_test 5 "test_spmd_model_weight_loading.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py" +run_and_track_test 6 "test_kv_cache_update_kernel.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py" + +# After all tests have been attempted, exit with the overall status. +if [ "$overall_script_exit_code" -ne 0 ]; then + echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---" +else + echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---" +fi +exit "$overall_script_exit_code" +' # IMPORTANT: This is the closing single quote for the bash -c "..." command. Ensure it is present and correct. + +# Capture the exit code of the docker run command +DOCKER_RUN_EXIT_CODE=$? + +# The trap will run for cleanup. +# Exit the main script with the Docker run command's exit code. +if [ "$DOCKER_RUN_EXIT_CODE" -ne 0 ]; then + echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE." + exit "$DOCKER_RUN_EXIT_CODE" +else + echo "Docker run command completed successfully." 
+ exit 0 +fi +# TODO: This test fails because it uses RANDOM_SEED sampling +# pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \ diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh index 5514d7770cff8..e565d4b246945 100755 --- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh +++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh @@ -150,18 +150,6 @@ run_and_track_test 9 "test_multimodal.py" \ "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py" run_and_track_test 10 "test_pallas.py" \ "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py" -run_and_track_test 11 "test_struct_output_generate.py" \ - "HF_HUB_DISABLE_XET=1 python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\"" -run_and_track_test 12 "test_moe_pallas.py" \ - "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py" -run_and_track_test 13 "test_lora.py" \ - "VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py" -run_and_track_test 14 "test_tpu_qkv_linear.py" \ - "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py" -run_and_track_test 15 "test_spmd_model_weight_loading.py" \ - "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py" -run_and_track_test 16 "test_kv_cache_update_kernel.py" \ - "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py" # After all tests have been attempted, exit with the overall status. if [ "$overall_script_exit_code" -ne 0 ]; then diff --git a/.buildkite/scripts/run-benchmarks.sh b/.buildkite/scripts/run-benchmarks.sh index 195a8063fd743..72812218cb668 100644 --- a/.buildkite/scripts/run-benchmarks.sh +++ b/.buildkite/scripts/run-benchmarks.sh @@ -11,10 +11,10 @@ cd "$(dirname "${BASH_SOURCE[0]}")/../.." 
(which wget && which curl) || (apt-get update && apt-get install -y wget curl) # run python-based benchmarks and upload the result to buildkite -python3 benchmarks/benchmark_latency.py --output-json latency_results.json 2>&1 | tee benchmark_latency.txt +vllm bench latency --output-json latency_results.json 2>&1 | tee benchmark_latency.txt bench_latency_exit_code=$? -python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt +vllm bench throughput --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt bench_throughput_exit_code=$? # run server-based benchmarks and upload the result to buildkite @@ -24,7 +24,7 @@ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/r # wait for server to start, timeout after 600 seconds timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1 -python3 benchmarks/benchmark_serving.py \ +vllm bench serve \ --backend vllm \ --dataset-name sharegpt \ --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \ diff --git a/.buildkite/scripts/tpu/run_bm.sh b/.buildkite/scripts/tpu/run_bm.sh index 877669cd956ac..beecaf7a740ae 100755 --- a/.buildkite/scripts/tpu/run_bm.sh +++ b/.buildkite/scripts/tpu/run_bm.sh @@ -77,7 +77,7 @@ done echo "run benchmark test..." 
echo "logging to $BM_LOG" echo -python benchmarks/benchmark_serving.py \ +vllm bench serve \ --backend vllm \ --model $MODEL \ --dataset-name sonnet \ diff --git a/.github/workflows/lint-and-deploy.yaml b/.github/workflows/lint-and-deploy.yaml index 74a7a3a3530f5..d5736c0aee208 100644 --- a/.github/workflows/lint-and-deploy.yaml +++ b/.github/workflows/lint-and-deploy.yaml @@ -7,7 +7,7 @@ permissions: jobs: lint-and-deploy: - runs-on: ubuntu-latest + runs-on: ubuntu-24.04-arm steps: - name: Checkout uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 diff --git a/benchmarks/README.md b/benchmarks/README.md index fb8690d42db98..3b10963c3e014 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -98,7 +98,7 @@ Then run the benchmarking script ```bash # download dataset # wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json -python3 vllm/benchmarks/benchmark_serving.py \ +vllm bench serve \ --backend vllm \ --model NousResearch/Hermes-3-Llama-3.1-8B \ --endpoint /v1/completions \ @@ -111,25 +111,25 @@ If successful, you will see the following output ``` ============ Serving Benchmark Result ============ -Successful requests: 10 -Benchmark duration (s): 5.78 -Total input tokens: 1369 -Total generated tokens: 2212 -Request throughput (req/s): 1.73 -Output token throughput (tok/s): 382.89 -Total Token throughput (tok/s): 619.85 +Successful requests: 10 +Benchmark duration (s): 5.78 +Total input tokens: 1369 +Total generated tokens: 2212 +Request throughput (req/s): 1.73 +Output token throughput (tok/s): 382.89 +Total Token throughput (tok/s): 619.85 ---------------Time to First Token---------------- -Mean TTFT (ms): 71.54 -Median TTFT (ms): 73.88 -P99 TTFT (ms): 79.49 +Mean TTFT (ms): 71.54 +Median TTFT (ms): 73.88 +P99 TTFT (ms): 79.49 -----Time per Output Token (excl. 
1st token)------ -Mean TPOT (ms): 7.91 -Median TPOT (ms): 7.96 -P99 TPOT (ms): 8.03 +Mean TPOT (ms): 7.91 +Median TPOT (ms): 7.96 +P99 TPOT (ms): 8.03 ---------------Inter-token Latency---------------- -Mean ITL (ms): 7.74 -Median ITL (ms): 7.70 -P99 ITL (ms): 8.39 +Mean ITL (ms): 7.74 +Median ITL (ms): 7.70 +P99 ITL (ms): 8.39 ================================================== ``` @@ -141,7 +141,7 @@ If the dataset you want to benchmark is not supported yet in vLLM, even then you {"prompt": "What is the capital of India?"} {"prompt": "What is the capital of Iran?"} {"prompt": "What is the capital of China?"} -``` +``` ```bash # start server @@ -150,7 +150,7 @@ VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct --disable-log-requests ```bash # run benchmarking script -python3 benchmarks/benchmark_serving.py --port 9001 --save-result --save-detailed \ +vllm bench serve --port 9001 --save-result --save-detailed \ --backend vllm \ --model meta-llama/Llama-3.1-8B-Instruct \ --endpoint /v1/completions \ @@ -174,7 +174,7 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests ``` ```bash -python3 vllm/benchmarks/benchmark_serving.py \ +vllm bench serve \ --backend openai-chat \ --model Qwen/Qwen2-VL-7B-Instruct \ --endpoint /v1/chat/completions \ @@ -194,7 +194,7 @@ VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \ ``` ``` bash -python3 benchmarks/benchmark_serving.py \ +vllm bench serve \ --model meta-llama/Meta-Llama-3-8B-Instruct \ --dataset-name hf \ --dataset-path likaixin/InstructCoder \ @@ -210,7 +210,7 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests **`lmms-lab/LLaVA-OneVision-Data`** ```bash -python3 vllm/benchmarks/benchmark_serving.py \ +vllm bench serve \ --backend openai-chat \ --model Qwen/Qwen2-VL-7B-Instruct \ --endpoint /v1/chat/completions \ @@ -224,7 +224,7 @@ python3 vllm/benchmarks/benchmark_serving.py \ **`Aeala/ShareGPT_Vicuna_unfiltered`** ```bash -python3 vllm/benchmarks/benchmark_serving.py \ +vllm bench 
serve \ --backend openai-chat \ --model Qwen/Qwen2-VL-7B-Instruct \ --endpoint /v1/chat/completions \ @@ -237,7 +237,7 @@ python3 vllm/benchmarks/benchmark_serving.py \ **`AI-MO/aimo-validation-aime`** ``` bash -python3 vllm/benchmarks/benchmark_serving.py \ +vllm bench serve \ --model Qwen/QwQ-32B \ --dataset-name hf \ --dataset-path AI-MO/aimo-validation-aime \ @@ -248,7 +248,7 @@ python3 vllm/benchmarks/benchmark_serving.py \ **`philschmid/mt-bench`** ``` bash -python3 vllm/benchmarks/benchmark_serving.py \ +vllm bench serve \ --model Qwen/QwQ-32B \ --dataset-name hf \ --dataset-path philschmid/mt-bench \ @@ -261,7 +261,7 @@ When using OpenAI-compatible backends such as `vllm`, optional sampling parameters can be specified. Example client command: ```bash -python3 vllm/benchmarks/benchmark_serving.py \ +vllm bench serve \ --backend vllm \ --model NousResearch/Hermes-3-Llama-3.1-8B \ --endpoint /v1/completions \ @@ -296,7 +296,7 @@ The following arguments can be used to control the ramp-up:
```bash -python3 vllm/benchmarks/benchmark_throughput.py \ +vllm bench throughput \ --model NousResearch/Hermes-3-Llama-3.1-8B \ --dataset-name sonnet \ --dataset-path vllm/benchmarks/sonnet.txt \ @@ -314,7 +314,7 @@ Total num output tokens: 1500 **VisionArena Benchmark for Vision Language Models** ``` bash -python3 vllm/benchmarks/benchmark_throughput.py \ +vllm bench throughput \ --model Qwen/Qwen2-VL-7B-Instruct \ --backend vllm-chat \ --dataset-name hf \ @@ -336,7 +336,7 @@ Total num output tokens: 1280 ``` bash VLLM_WORKER_MULTIPROC_METHOD=spawn \ VLLM_USE_V1=1 \ -python3 vllm/benchmarks/benchmark_throughput.py \ +vllm bench throughput \ --dataset-name=hf \ --dataset-path=likaixin/InstructCoder \ --model=meta-llama/Meta-Llama-3-8B-Instruct \ @@ -360,7 +360,7 @@ Total num output tokens: 204800 **`lmms-lab/LLaVA-OneVision-Data`** ```bash -python3 vllm/benchmarks/benchmark_throughput.py \ +vllm bench throughput \ --model Qwen/Qwen2-VL-7B-Instruct \ --backend vllm-chat \ --dataset-name hf \ @@ -373,7 +373,7 @@ python3 vllm/benchmarks/benchmark_throughput.py \ **`Aeala/ShareGPT_Vicuna_unfiltered`** ```bash -python3 vllm/benchmarks/benchmark_throughput.py \ +vllm bench throughput \ --model Qwen/Qwen2-VL-7B-Instruct \ --backend vllm-chat \ --dataset-name hf \ @@ -385,7 +385,7 @@ python3 vllm/benchmarks/benchmark_throughput.py \ **`AI-MO/aimo-validation-aime`** ```bash -python3 benchmarks/benchmark_throughput.py \ +vllm bench throughput \ --model Qwen/QwQ-32B \ --backend vllm \ --dataset-name hf \ @@ -399,7 +399,7 @@ python3 benchmarks/benchmark_throughput.py \ ``` bash # download dataset # wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json -python3 vllm/benchmarks/benchmark_throughput.py \ +vllm bench throughput \ --model meta-llama/Llama-2-7b-hf \ --backend vllm \ --dataset_path /ShareGPT_V3_unfiltered_cleaned_split.json \ diff --git a/benchmarks/auto_tune/README.md 
b/benchmarks/auto_tune/README.md index ae5962fe92542..c479ff1aa29c0 100644 --- a/benchmarks/auto_tune/README.md +++ b/benchmarks/auto_tune/README.md @@ -105,7 +105,7 @@ After the script finishes, you will find the results in a new, timestamped direc - **Log Files**: The directory (`$BASE/auto-benchmark/YYYY_MM_DD_HH_MM/`) contains detailed logs for each run: - `vllm_log_...txt`: The log output from the vLLM server for each parameter combination. - - `bm_log_...txt`: The log output from the `benchmark_serving.py` script for each benchmark run. + - `bm_log_...txt`: The log output from the `vllm bench serve` command for each benchmark run. - **Final Result Summary**: A file named `result.txt` is created in the log directory. It contains a summary of each tested combination and concludes with the overall best parameters found. diff --git a/benchmarks/auto_tune/auto_tune.sh b/benchmarks/auto_tune/auto_tune.sh index 8d3e1d4bee352..3cd8580e065dd 100644 --- a/benchmarks/auto_tune/auto_tune.sh +++ b/benchmarks/auto_tune/auto_tune.sh @@ -1,6 +1,6 @@ #!/bin/bash -# This script aims to tune the best server parameter combinations to maximize throughput for given requirement. +# This script aims to tune the best server parameter combinations to maximize throughput for given requirement. # See details in README (benchmarks/auto_tune/README.md). TAG=$(date +"%Y_%m_%d_%H_%M") @@ -56,7 +56,7 @@ start_server() { local max_num_batched_tokens=$3 local vllm_log=$4 local profile_dir=$5 - + pkill -f vllm VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir vllm serve $MODEL \ @@ -73,9 +73,9 @@ start_server() { # wait for 10 minutes... 
server_started=0 - for i in {1..60}; do + for i in {1..60}; do RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout) - STATUS_CODE=$(echo "$RESPONSE" | tail -n 1) + STATUS_CODE=$(echo "$RESPONSE" | tail -n 1) if [[ "$STATUS_CODE" -eq 200 ]]; then server_started=1 break @@ -98,10 +98,10 @@ update_best_profile() { selected_profile_file= if [[ "$SYSTEM" == "TPU" ]]; then selected_profile_file="${sorted_paths[$profile_index]}/*.xplane.pb" - fi + fi if [[ "$SYSTEM" == "GPU" ]]; then selected_profile_file="${sorted_paths[$profile_index]}" - fi + fi rm -f $PROFILE_PATH/* cp $selected_profile_file $PROFILE_PATH } @@ -129,14 +129,14 @@ run_benchmark() { echo "server started." fi echo - + echo "run benchmark test..." meet_latency_requirement=0 # get a basic qps by using request-rate inf bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt" prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 )) adjusted_input_len=$(( INPUT_LEN - prefix_len )) - python3 benchmarks/benchmark_serving.py \ + vllm bench serve \ --backend vllm \ --model $MODEL \ --dataset-name random \ @@ -169,7 +169,7 @@ adjusted_input_len=$(( INPUT_LEN - prefix_len )) curl -X POST http://0.0.0.0:8004/reset_prefix_cache sleep 5 bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt" - python3 benchmarks/benchmark_serving.py \ + vllm bench serve \ --backend vllm \ --model $MODEL \ --dataset-name random \ diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index 4d2ea126b24a5..d8b960edaa468 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -11,6 +11,7 @@ from typing import Any, Optional import numpy as np from tqdm import tqdm +from typing_extensions import deprecated import vllm.envs as envs from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json @@ -34,6 +35,10 @@ def save_to_pytorch_benchmark_format( 
write_to_json(pt_file, pt_records) +@deprecated( + "benchmark_latency.py is deprecated and will be removed in a " + "future version. Please use 'vllm bench latency' instead.", +) def main(args: argparse.Namespace): print(args) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index c597fb1068aba..a97fa280f37c0 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -38,6 +38,7 @@ from typing import Any, Literal, Optional import numpy as np from tqdm.asyncio import tqdm from transformers import PreTrainedTokenizerBase +from typing_extensions import deprecated from backend_request_func import ( ASYNC_REQUEST_FUNCS, @@ -593,6 +594,10 @@ def save_to_pytorch_benchmark_format( write_to_json(pt_file, pt_records) +@deprecated( + "benchmark_serving.py is deprecated and will be removed in a future " + "version. Please use 'vllm bench serve' instead.", +) def main(args: argparse.Namespace): print(args) random.seed(args.seed) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index c0a7f1d582505..c51b579686529 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -15,6 +15,7 @@ import torch import uvloop from tqdm import tqdm from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase +from typing_extensions import deprecated from benchmark_dataset import ( AIMODataset, @@ -382,6 +383,10 @@ def get_requests(args, tokenizer): return dataset_cls(**common_kwargs).sample(**sample_kwargs) +@deprecated( + "benchmark_throughput.py is deprecated and will be removed in a " + "future version. 
Please use 'vllm bench throughput' instead.", +) def main(args: argparse.Namespace): if args.seed is None: args.seed = 0 diff --git a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh index 94999630bae12..92f97ffabea2a 100644 --- a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh +++ b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh @@ -3,7 +3,7 @@ # benchmark the overhead of disaggregated prefill. # methodology: # - send all request to prefill vLLM instance. It will buffer KV cache. -# - then send all request to decode instance. +# - then send all request to decode instance. # - The TTFT of decode instance is the overhead. set -ex @@ -12,6 +12,8 @@ kill_gpu_processes() { # kill all processes on GPU. pgrep pt_main_thread | xargs -r kill -9 pgrep python3 | xargs -r kill -9 + # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445 + pgrep VLLM | xargs -r kill -9 sleep 10 # remove vllm config file @@ -61,7 +63,7 @@ benchmark() { --gpu-memory-utilization 0.6 \ --kv-transfer-config \ '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' & - + CUDA_VISIBLE_DEVICES=1 python3 \ -m vllm.entrypoints.openai.api_server \ @@ -76,38 +78,38 @@ benchmark() { wait_for_server 8200 # let the prefill instance finish prefill - python3 ../benchmark_serving.py \ - --backend vllm \ - --model $model \ - --dataset-name $dataset_name \ - --dataset-path $dataset_path \ - --sonnet-input-len $input_len \ - --sonnet-output-len "$output_len" \ - --sonnet-prefix-len $prefix_len \ - --num-prompts $num_prompts \ - --port 8100 \ - --save-result \ - --result-dir $results_folder \ - --result-filename disagg_prefill_tp1.json \ - --request-rate "inf" + vllm bench serve \ + --backend vllm \ + --model $model \ + --dataset-name $dataset_name \ + --dataset-path $dataset_path \ + --sonnet-input-len $input_len \ + 
--sonnet-output-len "$output_len" \ + --sonnet-prefix-len $prefix_len \ + --num-prompts $num_prompts \ + --port 8100 \ + --save-result \ + --result-dir $results_folder \ + --result-filename disagg_prefill_tp1.json \ + --request-rate "inf" # send the request to decode. # The TTFT of this command will be the overhead of disagg prefill impl. - python3 ../benchmark_serving.py \ - --backend vllm \ - --model $model \ - --dataset-name $dataset_name \ - --dataset-path $dataset_path \ - --sonnet-input-len $input_len \ - --sonnet-output-len "$output_len" \ - --sonnet-prefix-len $prefix_len \ - --num-prompts $num_prompts \ - --port 8200 \ - --save-result \ - --result-dir $results_folder \ - --result-filename disagg_prefill_tp1_overhead.json \ - --request-rate "$qps" + vllm bench serve \ + --backend vllm \ + --model $model \ + --dataset-name $dataset_name \ + --dataset-path $dataset_path \ + --sonnet-input-len $input_len \ + --sonnet-output-len "$output_len" \ + --sonnet-prefix-len $prefix_len \ + --num-prompts $num_prompts \ + --port 8200 \ + --save-result \ + --result-dir $results_folder \ + --result-filename disagg_prefill_tp1_overhead.json \ + --request-rate "$qps" kill_gpu_processes } diff --git a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh index eb5d891d0d4a5..af2bcba3ea57a 100644 --- a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh +++ b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh @@ -18,6 +18,8 @@ kill_gpu_processes() { # kill all processes on GPU. 
pgrep pt_main_thread | xargs -r kill -9 pgrep python3 | xargs -r kill -9 + # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445 + pgrep VLLM | xargs -r kill -9 for port in 8000 8100 8200; do lsof -t -i:$port | xargs -r kill -9; done sleep 1 } @@ -58,7 +60,7 @@ launch_chunked_prefill() { launch_disagg_prefill() { - model="meta-llama/Meta-Llama-3.1-8B-Instruct" + model="meta-llama/Meta-Llama-3.1-8B-Instruct" # disagg prefill CUDA_VISIBLE_DEVICES=0 python3 \ -m vllm.entrypoints.openai.api_server \ @@ -97,20 +99,20 @@ benchmark() { output_len=$2 tag=$3 - python3 ../benchmark_serving.py \ - --backend vllm \ - --model $model \ - --dataset-name $dataset_name \ - --dataset-path $dataset_path \ - --sonnet-input-len $input_len \ - --sonnet-output-len "$output_len" \ - --sonnet-prefix-len $prefix_len \ - --num-prompts $num_prompts \ - --port 8000 \ - --save-result \ - --result-dir $results_folder \ - --result-filename "$tag"-qps-"$qps".json \ - --request-rate "$qps" + vllm bench serve \ + --backend vllm \ + --model $model \ + --dataset-name $dataset_name \ + --dataset-path $dataset_path \ + --sonnet-input-len $input_len \ + --sonnet-output-len "$output_len" \ + --sonnet-prefix-len $prefix_len \ + --num-prompts $num_prompts \ + --port 8000 \ + --save-result \ + --result-dir $results_folder \ + --result-filename "$tag"-qps-"$qps".json \ + --request-rate "$qps" sleep 2 } diff --git a/benchmarks/kernels/benchmark_moe_align_block_size.py b/benchmarks/kernels/benchmark_moe_align_block_size.py index 1af5a21caf465..f540cff6261a8 100644 --- a/benchmarks/kernels/benchmark_moe_align_block_size.py +++ b/benchmarks/kernels/benchmark_moe_align_block_size.py @@ -5,9 +5,8 @@ import itertools import torch -from vllm import _custom_ops as ops from vllm.model_executor.layers.fused_moe.moe_align_block_size import ( - moe_align_block_size_triton, + moe_align_block_size, ) from vllm.triton_utils import triton @@ -21,60 +20,6 @@ def 
get_topk_ids(num_tokens: int, num_experts: int, topk: int) -> torch.Tensor: ) -def check_correctness(num_tokens, num_experts=256, block_size=256, topk=8): - """ - Verifies vllm vs. Triton - """ - topk_ids = get_topk_ids(num_tokens, num_experts, topk) - - # 1. malloc space for triton and vllm - # malloc enough space (max_num_tokens_padded) for the sorted ids - max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1) - sorted_ids_triton = torch.empty( - (max_num_tokens_padded,), dtype=torch.int32, device="cuda" - ) - expert_ids_triton = torch.empty( - (max_num_tokens_padded // block_size,), dtype=torch.int32, device="cuda" - ) - num_tokens_post_pad_triton = torch.empty((1,), dtype=torch.int32, device="cuda") - - sorted_ids_vllm = torch.empty_like(sorted_ids_triton) - expert_ids_vllm = torch.empty_like(expert_ids_triton) - num_tokens_post_pad_vllm = torch.empty_like(num_tokens_post_pad_triton) - - # 2. run implementations - moe_align_block_size_triton( - topk_ids, - num_experts, - block_size, - sorted_ids_triton, - expert_ids_triton, - num_tokens_post_pad_triton, - ) - - ops.moe_align_block_size( - topk_ids, - num_experts, - block_size, - sorted_ids_vllm, - expert_ids_vllm, - num_tokens_post_pad_vllm, - ) - print(f"✅ VLLM implementation works with {num_experts} experts!") - - # 3. 
compare results - if torch.allclose(expert_ids_triton, expert_ids_vllm) and torch.allclose( - num_tokens_post_pad_triton, num_tokens_post_pad_vllm - ): - print("✅ Triton and VLLM implementations match.") - else: - print("❌ Triton and VLLM implementations DO NOT match.") - print("Triton expert_ids:", expert_ids_triton) - print("VLLM expert_ids:", expert_ids_vllm) - print("Triton num_tokens_post_pad:", num_tokens_post_pad_triton) - print("VLLM num_tokens_post_pad:", num_tokens_post_pad_vllm) - - # test configurations num_tokens_range = [1, 16, 256, 4096] num_experts_range = [16, 64, 224, 256, 280, 512] @@ -87,8 +32,8 @@ configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range x_names=["num_tokens", "num_experts", "topk"], x_vals=configs, line_arg="provider", - line_vals=["vllm", "triton"], # "triton" - line_names=["VLLM", "Triton"], # "Triton" + line_vals=["vllm"], + line_names=["vLLM"], plot_name="moe-align-block-size-performance", args={}, ) @@ -98,36 +43,11 @@ def benchmark(num_tokens, num_experts, topk, provider): block_size = 256 topk_ids = get_topk_ids(num_tokens, num_experts, topk) - max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1) - sorted_ids = torch.empty((max_num_tokens_padded,), dtype=torch.int32, device="cuda") - max_num_m_blocks = max_num_tokens_padded // block_size - expert_ids = torch.empty((max_num_m_blocks,), dtype=torch.int32, device="cuda") - num_tokens_post_pad = torch.empty((1,), dtype=torch.int32, device="cuda") - quantiles = [0.5, 0.2, 0.8] if provider == "vllm": ms, min_ms, max_ms = triton.testing.do_bench( - lambda: ops.moe_align_block_size( - topk_ids, - num_experts, - block_size, - sorted_ids.clone(), - expert_ids.clone(), - num_tokens_post_pad.clone(), - ), - quantiles=quantiles, - ) - elif provider == "triton": - ms, min_ms, max_ms = triton.testing.do_bench( - lambda: moe_align_block_size_triton( - topk_ids, - num_experts, - block_size, - sorted_ids.clone(), - expert_ids.clone(), - 
num_tokens_post_pad.clone(), - ), + lambda: moe_align_block_size(topk_ids, block_size, num_experts), quantiles=quantiles, ) @@ -151,6 +71,4 @@ if __name__ == "__main__": ) args = parser.parse_args() - print("Running correctness check...") - check_correctness(num_tokens=1024, num_experts=args.num_experts, topk=args.topk) benchmark.run(print_data=True, show_plots=True) diff --git a/benchmarks/kernels/benchmark_moe_permute_unpermute.py b/benchmarks/kernels/benchmark_moe_permute_unpermute.py index 4ed6900901442..04d2205aa3722 100644 --- a/benchmarks/kernels/benchmark_moe_permute_unpermute.py +++ b/benchmarks/kernels/benchmark_moe_permute_unpermute.py @@ -8,12 +8,13 @@ import ray import torch from transformers import AutoConfig -from vllm.model_executor.layers.fused_moe.deep_gemm_moe import ( +from vllm.model_executor.layers.fused_moe.fused_moe import * +from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import ( _moe_permute, _moe_unpermute_and_reduce, + moe_permute, + moe_unpermute, ) -from vllm.model_executor.layers.fused_moe.fused_moe import * -from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import * from vllm.model_executor.layers.fused_moe.utils import _fp8_quantize from vllm.platforms import current_platform from vllm.utils import FlexibleArgumentParser @@ -63,18 +64,19 @@ def benchmark_permute( def run(): if use_customized_permute: - (permuted_hidden_states, first_token_off, inv_perm_idx, m_indices) = ( - moe_permute( - qhidden_states, - topk_weights=topk_weights, - topk_ids=topk_ids, - token_expert_indices=token_expert_indices, - topk=topk, - n_expert=num_experts, - n_local_expert=num_experts, - expert_map=None, - align_block_size=align_block_size, - ) + ( + permuted_hidden_states, + a1q_scale, + first_token_off, + inv_perm_idx, + m_indices, + ) = moe_permute( + qhidden_states, + a1q_scale=None, + topk_ids=topk_ids, + n_expert=num_experts, + expert_map=None, + align_block_size=align_block_size, ) else: ( @@ -150,18 +152,19 @@ def 
benchmark_unpermute( def prepare(): if use_customized_permute: - (permuted_hidden_states, first_token_off, inv_perm_idx, m_indices) = ( - moe_permute( - qhidden_states, - topk_weights=topk_weights, - topk_ids=topk_ids, - token_expert_indices=token_expert_indices, - topk=topk, - n_expert=num_experts, - n_local_expert=num_experts, - expert_map=None, - align_block_size=align_block_size, - ) + ( + permuted_hidden_states, + a1q_scale, + first_token_off, + inv_perm_idx, + m_indices, + ) = moe_permute( + qhidden_states, + a1q_scale=None, + topk_ids=topk_ids, + n_expert=num_experts, + expert_map=None, + align_block_size=align_block_size, ) # convert to fp16/bf16 as gemm output return ( @@ -191,16 +194,19 @@ def benchmark_unpermute( def run(input: tuple): if use_customized_permute: - (permuted_hidden_states, first_token_off, inv_perm_idx, m_indices) = input + ( + permuted_hidden_states, + first_token_off, + inv_perm_idx, + m_indices, + ) = input + output = torch.empty_like(hidden_states) moe_unpermute( + output, permuted_hidden_states, topk_weights, - topk_ids, inv_perm_idx, first_token_off, - topk, - num_experts, - num_experts, ) else: ( @@ -211,7 +217,11 @@ def benchmark_unpermute( inv_perm, ) = input _moe_unpermute_and_reduce( - output_hidden_states, permuted_hidden_states, inv_perm, topk_weights + output_hidden_states, + permuted_hidden_states, + inv_perm, + topk_weights, + True, ) # JIT compilation & warmup diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp index f1738aee980b6..b20a054648428 100644 --- a/csrc/cpu/torch_bindings.cpp +++ b/csrc/cpu/torch_bindings.cpp @@ -151,7 +151,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.impl("rotary_embedding", torch::kCPU, &rotary_embedding); // Quantization -#if defined(__AVX512F__) || defined(__aarch64__) +#if defined(__AVX512F__) || (defined(__aarch64__) && !defined(__APPLE__)) at::Tag stride_tag = at::Tag::needs_fixed_stride_order; // Compute int8 quantized tensor for given scaling factor. 
diff --git a/csrc/moe/moe_permute_unpermute_op.cu b/csrc/moe/moe_permute_unpermute_op.cu index a77471a7f2078..2922352a3f7cc 100644 --- a/csrc/moe/moe_permute_unpermute_op.cu +++ b/csrc/moe/moe_permute_unpermute_op.cu @@ -10,32 +10,28 @@ void moe_permute( const torch::Tensor& input, // [n_token, hidden] - const torch::Tensor& topk_weights, //[n_token, topk] - torch::Tensor& topk_ids, // [n_token, topk] + const torch::Tensor& topk_ids, // [n_token, topk] const torch::Tensor& token_expert_indices, // [n_token, topk] const std::optional& expert_map, // [n_expert] int64_t n_expert, int64_t n_local_expert, int64_t topk, const std::optional& align_block_size, - torch::Tensor& - permuted_input, // [topk * n_token/align_block_size_m, hidden] + torch::Tensor& permuted_input, // [permuted_size, hidden] torch::Tensor& expert_first_token_offset, // [n_local_expert + 1] - torch::Tensor& src_row_id2dst_row_id_map, // [n_token, topk] + torch::Tensor& inv_permuted_idx, // [n_token, topk] + torch::Tensor& permuted_idx, // [permute_size] torch::Tensor& m_indices) { // [align_expand_m] - TORCH_CHECK(topk_weights.scalar_type() == at::ScalarType::Float, - "topk_weights must be float32"); TORCH_CHECK(expert_first_token_offset.scalar_type() == at::ScalarType::Long, "expert_first_token_offset must be int64"); TORCH_CHECK(topk_ids.scalar_type() == at::ScalarType::Int, "topk_ids must be int32"); TORCH_CHECK(token_expert_indices.scalar_type() == at::ScalarType::Int, "token_expert_indices must be int32"); - TORCH_CHECK(src_row_id2dst_row_id_map.scalar_type() == at::ScalarType::Int, - "src_row_id2dst_row_id_map must be int32"); + TORCH_CHECK(inv_permuted_idx.scalar_type() == at::ScalarType::Int, + "inv_permuted_idx must be int32"); TORCH_CHECK(expert_first_token_offset.size(0) == n_local_expert + 1, "expert_first_token_offset shape != n_local_expert+1") - TORCH_CHECK( - src_row_id2dst_row_id_map.sizes() == token_expert_indices.sizes(), - "token_expert_indices shape must be same as 
src_row_id2dst_row_id_map"); + TORCH_CHECK(inv_permuted_idx.sizes() == token_expert_indices.sizes(), + "token_expert_indices shape must be same as inv_permuted_idx"); auto n_token = input.sizes()[0]; auto n_hidden = input.sizes()[1]; auto align_block_size_value = @@ -46,8 +42,9 @@ void moe_permute( auto sort_workspace = torch::empty( {sorter_size}, torch::dtype(torch::kInt8).device(torch::kCUDA).requires_grad(false)); + auto copy_topk_ids = topk_ids.clone(); // copy topk_ids for preprocess auto permuted_experts_id = torch::empty_like(topk_ids); - auto dst_row_id2src_row_id_map = torch::empty_like(src_row_id2dst_row_id_map); + auto sorted_row_idx = torch::empty_like(inv_permuted_idx); auto align_expert_first_token_offset = torch::zeros_like(expert_first_token_offset); @@ -67,24 +64,22 @@ void moe_permute( const int* expert_map_ptr = get_ptr(expert_map.value()); valid_num_ptr = get_ptr(expert_first_token_offset) + n_local_expert; - preprocessTopkIdLauncher(get_ptr(topk_ids), n_token * topk, + preprocessTopkIdLauncher(get_ptr(copy_topk_ids), n_token * topk, expert_map_ptr, n_expert, stream); } // expert sort topk expert id and scan expert id get expert_first_token_offset - sortAndScanExpert(get_ptr(topk_ids), get_ptr(token_expert_indices), - get_ptr(permuted_experts_id), - get_ptr(dst_row_id2src_row_id_map), - get_ptr(expert_first_token_offset), n_token, - n_expert, n_local_expert, topk, sorter, - get_ptr(sort_workspace), stream); + sortAndScanExpert( + get_ptr(copy_topk_ids), get_ptr(token_expert_indices), + get_ptr(permuted_experts_id), get_ptr(sorted_row_idx), + get_ptr(expert_first_token_offset), n_token, n_expert, + n_local_expert, topk, sorter, get_ptr(sort_workspace), stream); // dispatch expandInputRowsKernelLauncher MOE_DISPATCH(input.scalar_type(), [&] { expandInputRowsKernelLauncher( get_ptr(input), get_ptr(permuted_input), - get_ptr(topk_weights), get_ptr(permuted_experts_id), - get_ptr(dst_row_id2src_row_id_map), - get_ptr(src_row_id2dst_row_id_map), + 
get_ptr(permuted_experts_id), get_ptr(sorted_row_idx), + get_ptr(inv_permuted_idx), get_ptr(permuted_idx), get_ptr(expert_first_token_offset), n_token, valid_num_ptr, n_hidden, topk, n_local_expert, align_block_size_value, stream); }); @@ -101,32 +96,34 @@ void moe_permute( } void moe_unpermute( - const torch::Tensor& permuted_hidden_states, // [n_token * topk, hidden] - const torch::Tensor& topk_weights, //[n_token, topk] - const torch::Tensor& topk_ids, // [n_token, topk] - const torch::Tensor& src_row_id2dst_row_id_map, // [n_token, topk] - const torch::Tensor& expert_first_token_offset, // [n_local_expert+1] - int64_t n_expert, int64_t n_local_expert, int64_t topk, + const torch::Tensor& permuted_hidden_states, // [n_token * topk, hidden] + const torch::Tensor& topk_weights, // [n_token, topk] + const torch::Tensor& inv_permuted_idx, // [n_token, topk] + const std::optional& + expert_first_token_offset, // [n_local_expert+1] + int64_t topk, torch::Tensor& hidden_states // [n_token, hidden] ) { - TORCH_CHECK(src_row_id2dst_row_id_map.sizes() == topk_ids.sizes(), - "topk_ids shape must be same as src_row_id2dst_row_id_map"); - TORCH_CHECK(topk_ids.scalar_type() == at::ScalarType::Int, - "topk_ids must be int32"); TORCH_CHECK( permuted_hidden_states.scalar_type() == hidden_states.scalar_type(), - "topk_ids dtype must be same as src_row_id2dst_row_id_map"); + "permuted_hidden_states dtype must be same as hidden_states"); auto n_token = hidden_states.size(0); auto n_hidden = hidden_states.size(1); auto stream = at::cuda::getCurrentCUDAStream().stream(); - const int64_t* valid_ptr = - get_ptr(expert_first_token_offset) + n_local_expert; + + int64_t const* valid_ptr = nullptr; + if (expert_first_token_offset.has_value()) { + int n_local_expert = expert_first_token_offset.value().size(0) - 1; + valid_ptr = + get_ptr(expert_first_token_offset.value()) + n_local_expert; + } + MOE_DISPATCH(hidden_states.scalar_type(), [&] { finalizeMoeRoutingKernelLauncher( 
get_ptr(permuted_hidden_states), get_ptr(hidden_states), get_ptr(topk_weights), - get_ptr(src_row_id2dst_row_id_map), get_ptr(topk_ids), - n_token, n_hidden, topk, valid_ptr, stream); + get_ptr(inv_permuted_idx), n_token, n_hidden, topk, valid_ptr, + stream); }); } diff --git a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu index de2c153882d93..2271c1bc75b1f 100644 --- a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu +++ b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu @@ -177,7 +177,7 @@ __global__ void getMIndicesKernel(int64_t* expert_first_token_offset, int tidx = threadIdx.x; extern __shared__ int64_t smem_expert_first_token_offset[]; for (int i = tidx; i <= num_local_expert; i += blockDim.x) { - smem_expert_first_token_offset[tidx] = __ldg(expert_first_token_offset + i); + smem_expert_first_token_offset[i] = __ldg(expert_first_token_offset + i); } __syncthreads(); auto last_token_offset = smem_expert_first_token_offset[eidx + 1]; diff --git a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.h b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.h index 43c29721cd16e..108091efbefa8 100644 --- a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.h +++ b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.h @@ -57,31 +57,19 @@ void sortAndScanExpert(int* expert_for_source_row, const int* source_rows, template void expandInputRowsKernelLauncher( - T const* unpermuted_input, T* permuted_output, - const float* unpermuted_scales, int* sorted_experts, + T const* unpermuted_input, T* permuted_output, int* sorted_experts, int const* expanded_dest_row_to_expanded_source_row, - int* expanded_source_row_to_expanded_dest_row, + int* expanded_source_row_to_expanded_dest_row, int* permuted_idx, int64_t* expert_first_token_offset, int64_t const num_rows, int64_t const* num_valid_tokens_ptr, 
int64_t const cols, int const k, int num_local_experts, const int& align_block_size, cudaStream_t stream); -// Final kernel to unpermute and scale -// This kernel unpermutes the original data, does the k-way reduction and -// performs the final skip connection. -template -__global__ void finalizeMoeRoutingKernel( - T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output, - float const* scales, int const* expanded_source_row_to_expanded_dest_row, - int const* expert_for_source_row, int64_t const orig_cols, int64_t const k, - int64_t const* num_valid_ptr); - template void finalizeMoeRoutingKernelLauncher( T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output, float const* scales, int const* expanded_source_row_to_expanded_dest_row, - int const* expert_for_source_row, int64_t const num_rows, - int64_t const cols, int64_t const k, int64_t const* num_valid_ptr, - cudaStream_t stream); + int64_t const num_rows, int64_t const cols, int64_t const k, + int64_t const* num_valid_ptr, cudaStream_t stream); void preprocessTopkIdLauncher(int* topk_id_ptr, int size, const int* expert_map_ptr, int num_experts, diff --git a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.inl b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.inl index ad0d390665a00..449243b92a283 100644 --- a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.inl +++ b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.inl @@ -2,10 +2,9 @@ template __global__ void expandInputRowsKernel( - T const* unpermuted_input, T* permuted_output, - const float* unpermuted_scales, int* sorted_experts, + T const* unpermuted_input, T* permuted_output, int* sorted_experts, int const* expanded_dest_row_to_expanded_source_row, - int* expanded_source_row_to_expanded_dest_row, + int* expanded_source_row_to_expanded_dest_row, int* permuted_idx, int64_t* expert_first_token_offset, int64_t const num_rows, int64_t const* num_dest_rows, int64_t const 
cols, int64_t k, int num_local_experts, int align_block_size) { @@ -54,6 +53,10 @@ __global__ void expandInputRowsKernel( assert(expanded_dest_row <= INT32_MAX); expanded_source_row_to_expanded_dest_row[expanded_source_row] = static_cast(expanded_dest_row); + // skip non local expert token + if (!CHECK_SKIPPED || blockIdx.x < *num_dest_rows) { + permuted_idx[expanded_dest_row] = expanded_source_row; + } } if (!CHECK_SKIPPED || blockIdx.x < *num_dest_rows) { @@ -62,7 +65,7 @@ __global__ void expandInputRowsKernel( using DataElem = cutlass::Array; // Duplicate and permute rows - int64_t const source_row = expanded_source_row % num_rows; + int64_t const source_row = expanded_source_row / k; auto const* source_row_ptr = reinterpret_cast(unpermuted_input + source_row * cols); @@ -82,10 +85,9 @@ __global__ void expandInputRowsKernel( template void expandInputRowsKernelLauncher( - T const* unpermuted_input, T* permuted_output, - const float* unpermuted_scales, int* sorted_experts, + T const* unpermuted_input, T* permuted_output, int* sorted_experts, int const* expanded_dest_row_to_expanded_source_row, - int* expanded_source_row_to_expanded_dest_row, + int* expanded_source_row_to_expanded_dest_row, int* permuted_idx, int64_t* expert_first_token_offset, int64_t const num_rows, int64_t const* num_valid_tokens_ptr, int64_t const cols, int const k, int num_local_experts, const int& align_block_size, cudaStream_t stream) { @@ -105,11 +107,11 @@ void expandInputRowsKernelLauncher( int64_t smem_size = sizeof(int64_t) * (num_local_experts + 1); func<<>>( - unpermuted_input, permuted_output, unpermuted_scales, sorted_experts, + unpermuted_input, permuted_output, sorted_experts, expanded_dest_row_to_expanded_source_row, - expanded_source_row_to_expanded_dest_row, expert_first_token_offset, - num_rows, num_valid_tokens_ptr, cols, k, num_local_experts, - align_block_size); + expanded_source_row_to_expanded_dest_row, permuted_idx, + expert_first_token_offset, num_rows, 
num_valid_tokens_ptr, cols, k, + num_local_experts, align_block_size); } template @@ -128,11 +130,9 @@ template __global__ void finalizeMoeRoutingKernel( T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output, float const* scales, int const* expanded_source_row_to_expanded_dest_row, - int const* expert_for_source_row, int64_t const orig_cols, int64_t const k, - int64_t const* num_valid_ptr) { + int64_t const orig_cols, int64_t const k, int64_t const* num_valid_ptr) { assert(orig_cols % 4 == 0); int64_t const original_row = blockIdx.x; - int64_t const num_rows = gridDim.x; auto const offset = original_row * orig_cols; OutputType* reduced_row_ptr = reduced_unpermuted_output + offset; int64_t const num_valid = *num_valid_ptr; @@ -159,14 +159,13 @@ __global__ void finalizeMoeRoutingKernel( ComputeElem thread_output; thread_output.fill(0); for (int k_idx = 0; k_idx < k; ++k_idx) { - int64_t const expanded_original_row = original_row + k_idx * num_rows; + int64_t const expanded_original_row = original_row * k + k_idx; int64_t const expanded_permuted_row = expanded_source_row_to_expanded_dest_row[expanded_original_row]; int64_t const k_offset = original_row * k + k_idx; float const row_scale = scales[k_offset]; - // Check after row_rescale has accumulated if (CHECK_SKIPPED && expanded_permuted_row >= num_valid) { continue; } @@ -189,9 +188,8 @@ template void finalizeMoeRoutingKernelLauncher( T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output, float const* scales, int const* expanded_source_row_to_expanded_dest_row, - int const* expert_for_source_row, int64_t const num_rows, - int64_t const cols, int64_t const k, int64_t const* num_valid_ptr, - cudaStream_t stream) { + int64_t const num_rows, int64_t const cols, int64_t const k, + int64_t const* num_valid_ptr, cudaStream_t stream) { int64_t const blocks = num_rows; int64_t const threads = 256; bool const check_finished = num_valid_ptr != nullptr; @@ -201,6 +199,5 @@ void 
finalizeMoeRoutingKernelLauncher( auto* const kernel = func_map[check_finished]; kernel<<>>( expanded_permuted_rows, reduced_unpermuted_output, scales, - expanded_source_row_to_expanded_dest_row, expert_for_source_row, cols, k, - num_valid_ptr); + expanded_source_row_to_expanded_dest_row, cols, k, num_valid_ptr); } diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp index 97df311d04409..d96e082f6ef11 100644 --- a/csrc/moe/torch_bindings.cpp +++ b/csrc/moe/torch_bindings.cpp @@ -56,18 +56,17 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) { " -> Tensor"); m.def( - "moe_permute(Tensor input, Tensor topk_weight, Tensor! topk_ids," + "moe_permute(Tensor input, Tensor topk_ids," "Tensor token_expert_indices, Tensor? expert_map, int n_expert," "int n_local_expert," "int topk, int? align_block_size,Tensor! permuted_input, Tensor! " - "expert_first_token_offset, Tensor! src_row_id2dst_row_id_map, Tensor! " - "m_indices)->()"); + "expert_first_token_offset, Tensor! inv_permuted_idx, Tensor! " + "permuted_idx, Tensor! m_indices)->()"); m.def( "moe_unpermute(Tensor permuted_hidden_states, Tensor topk_weights," - "Tensor topk_ids,Tensor src_row_id2dst_row_id_map, Tensor " - "expert_first_token_offset, int n_expert, int n_local_expert,int " - "topk, Tensor! hidden_states)->()"); + "Tensor inv_permuted_idx, Tensor? expert_first_token_offset, " + "int topk, Tensor! 
hidden_states)->()"); m.def("moe_permute_unpermute_supported() -> bool"); m.impl("moe_permute_unpermute_supported", &moe_permute_unpermute_supported); diff --git a/csrc/ops.h b/csrc/ops.h index 97a247d9d628c..207291eceb169 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -292,6 +292,11 @@ void per_token_group_quant_fp8(const torch::Tensor& input, torch::Tensor& output_q, torch::Tensor& output_s, int64_t group_size, double eps, double fp8_min, double fp8_max, bool scale_ue8m0); + +void per_token_group_quant_int8(const torch::Tensor& input, + torch::Tensor& output_q, + torch::Tensor& output_s, int64_t group_size, + double eps, double int8_min, double int8_max); #endif void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input, diff --git a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu index 5cd2ac179768b..6a81f159f46ae 100644 --- a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu +++ b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu @@ -1,6 +1,8 @@ #include #include +#include "../per_token_group_quant_8bit.h" + #include #include "../../dispatch_utils.h" @@ -336,3 +338,11 @@ void dynamic_scaled_int8_quant( } }); } + +void per_token_group_quant_int8(const torch::Tensor& input, + torch::Tensor& output_q, + torch::Tensor& output_s, int64_t group_size, + double eps, double int8_min, double int8_max) { + per_token_group_quant_8bit(input, output_q, output_s, group_size, eps, + int8_min, int8_max); +} \ No newline at end of file diff --git a/csrc/quantization/fp8/per_token_group_quant.cu b/csrc/quantization/fp8/per_token_group_quant.cu index afc41faeca902..2609054f2072b 100644 --- a/csrc/quantization/fp8/per_token_group_quant.cu +++ b/csrc/quantization/fp8/per_token_group_quant.cu @@ -1,6 +1,8 @@ #include #include +#include "../per_token_group_quant_8bit.h" + #include #include @@ -120,7 +122,7 @@ void per_token_group_quant_8bit(const torch::Tensor& input, 
torch::Tensor& output_q, torch::Tensor& output_s, int64_t group_size, double eps, double min_8bit, double max_8bit, - bool scale_ue8m0 = false) { + bool scale_ue8m0) { TORCH_CHECK(input.is_contiguous()); TORCH_CHECK(output_q.is_contiguous()); @@ -198,6 +200,8 @@ void per_token_group_quant_8bit(const torch::Tensor& input, input.scalar_type(), "per_token_group_quant_8bit", ([&] { if (dst_type == at::ScalarType::Float8_e4m3fn) { LAUNCH_KERNEL(scalar_t, c10::Float8_e4m3fn); + } else if (dst_type == at::ScalarType::Char) { + LAUNCH_KERNEL(scalar_t, int8_t); } })); diff --git a/csrc/quantization/machete/machete_prepacked_layout.cuh b/csrc/quantization/machete/machete_prepacked_layout.cuh index 81aaa6c4f3a28..4a7d6341e6c00 100644 --- a/csrc/quantization/machete/machete_prepacked_layout.cuh +++ b/csrc/quantization/machete/machete_prepacked_layout.cuh @@ -187,8 +187,12 @@ struct PrepackedLayoutBTemplate { CUTE_HOST_DEVICE static constexpr auto TVbNbKL_to_offset_copy( Shape_NKL shape_mkl) { auto layout = TVbNbKL_to_offset(shape_mkl); - return make_layout(coalesce(get<0>(layout)), get<1>(layout), - get<2>(layout)); + // for 4-bit elements, having >= 64 values per column + // allows TMA to load full 32-byte sectors + auto inner_layout = + make_layout(make_shape(_256{}, size<0>(layout) / _256{})); + + return make_layout(inner_layout, get<1>(layout), get<2>(layout)); } // ((BlockN, BlockK), (BlocksN, BlocksK), L) -> (storage_idx) diff --git a/csrc/quantization/per_token_group_quant_8bit.h b/csrc/quantization/per_token_group_quant_8bit.h new file mode 100644 index 0000000000000..537b61bc4303f --- /dev/null +++ b/csrc/quantization/per_token_group_quant_8bit.h @@ -0,0 +1,10 @@ +#pragma once +#include + +// TODO(wentao): refactor the folder to 8bit, then includes fp8 and int8 folders +// 8-bit per-token-group quantization helper used by both FP8 and INT8 +void per_token_group_quant_8bit(const torch::Tensor& input, + torch::Tensor& output_q, + torch::Tensor& output_s, int64_t 
group_size, + double eps, double min_8bit, double max_8bit, + bool scale_ue8m0 = false); \ No newline at end of file diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 95f8541bc9e2d..85b6abef00b03 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -624,6 +624,14 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.impl("per_token_group_fp8_quant", torch::kCUDA, &per_token_group_quant_fp8); + // Compute per-token-group INT8 quantized tensor and scaling factor. + ops.def( + "per_token_group_quant_int8(Tensor input, Tensor! output_q, Tensor! " + "output_s, int group_size, float eps, float int8_min, float int8_max) -> " + "()"); + ops.impl("per_token_group_quant_int8", torch::kCUDA, + &per_token_group_quant_int8); + // reorder weight for AllSpark Ampere W8A16 Fused Gemm kernel ops.def( "rearrange_kn_weight_as_n32k16_order(Tensor b_qweight, Tensor b_scales, " diff --git a/docker/Dockerfile.arm b/docker/Dockerfile.arm deleted file mode 100644 index bad093684239c..0000000000000 --- a/docker/Dockerfile.arm +++ /dev/null @@ -1,62 +0,0 @@ -# This vLLM Dockerfile is used to construct an image that can build and run vLLM on ARM CPU platform. - -FROM ubuntu:22.04 AS cpu-test-arm - -ENV CCACHE_DIR=/root/.cache/ccache - -ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache - -RUN --mount=type=cache,target=/var/cache/apt \ - apt-get update -y \ - && apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \ - && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \ - && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 - -# tcmalloc provides better memory allocation efficiency, e.g., holding memory in caches to speed up access of commonly-used objects. 
-RUN --mount=type=cache,target=/root/.cache/pip \ - pip install py-cpuinfo # Use this to gather CPU info and optimize based on ARM Neoverse cores - -# Set LD_PRELOAD for tcmalloc on ARM -ENV LD_PRELOAD="/usr/lib/aarch64-linux-gnu/libtcmalloc_minimal.so.4" - -RUN echo 'ulimit -c 0' >> ~/.bashrc - -WORKDIR /workspace - -ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" -ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} -RUN --mount=type=cache,target=/root/.cache/pip \ - --mount=type=bind,src=requirements/build.txt,target=requirements/build.txt \ - pip install --upgrade pip && \ - pip install -r requirements/build.txt - -FROM cpu-test-arm AS build - -WORKDIR /workspace/vllm - -RUN --mount=type=cache,target=/root/.cache/pip \ - --mount=type=bind,src=requirements/common.txt,target=requirements/common.txt \ - --mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \ - pip install -v -r requirements/cpu.txt - -COPY . . -ARG GIT_REPO_CHECK=0 -RUN --mount=type=bind,source=.git,target=.git \ - if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi - -# Disabling AVX512 specific optimizations for ARM -ARG VLLM_CPU_DISABLE_AVX512="true" -ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512} - -RUN --mount=type=cache,target=/root/.cache/pip \ - --mount=type=cache,target=/root/.cache/ccache \ - --mount=type=bind,source=.git,target=.git \ - VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \ - pip install dist/*.whl && \ - rm -rf dist - -WORKDIR /workspace/ - -RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks - -ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] \ No newline at end of file diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu index 982c1ddf27438..5e49e87131ece 100644 --- a/docker/Dockerfile.cpu +++ b/docker/Dockerfile.cpu @@ -1,4 +1,11 @@ -# This vLLM Dockerfile is used to construct image that can build and run vLLM on x86 CPU platform. 
+# This vLLM Dockerfile is used to build images that can run vLLM on both x86_64 and arm64 CPU platforms. +# +# Supported platforms: +# - linux/amd64 (x86_64) +# - linux/arm64 (aarch64) +# +# Use the `--platform` option with `docker buildx build` to specify the target architecture, e.g.: +# docker buildx build --platform=linux/arm64 -f docker/Dockerfile.cpu . # # Build targets: # vllm-openai (default): used for serving deployment @@ -53,7 +60,20 @@ RUN --mount=type=cache,target=/root/.cache/uv \ uv pip install --upgrade pip && \ uv pip install -r requirements/cpu.txt -ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/opt/venv/lib/libiomp5.so:$LD_PRELOAD" +ARG TARGETARCH +ENV TARGETARCH=${TARGETARCH} + +RUN if [ "$TARGETARCH" = "arm64" ]; then \ + PRELOAD_PATH="/usr/lib/aarch64-linux-gnu/libtcmalloc_minimal.so.4"; \ + else \ + PRELOAD_PATH="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/opt/venv/lib/libiomp5.so"; \ + fi && \ + echo "export LD_PRELOAD=$PRELOAD_PATH" >> ~/.bashrc + +# Ensure that the LD_PRELOAD environment variable for export is in effect. +SHELL ["/bin/bash", "-c"] + +ENV LD_PRELOAD=${LD_PRELOAD} RUN echo 'ulimit -c 0' >> ~/.bashrc diff --git a/docker/Dockerfile.tpu b/docker/Dockerfile.tpu index 3474ff50de7bd..b9fc9def88190 100644 --- a/docker/Dockerfile.tpu +++ b/docker/Dockerfile.tpu @@ -1,4 +1,4 @@ -ARG NIGHTLY_DATE="20250714" +ARG NIGHTLY_DATE="20250724" ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.12_tpuvm_$NIGHTLY_DATE" FROM $BASE_IMAGE diff --git a/docs/contributing/profiling.md b/docs/contributing/profiling.md index a5851cfe963d2..13c3bc2c7e031 100644 --- a/docs/contributing/profiling.md +++ b/docs/contributing/profiling.md @@ -9,10 +9,13 @@ We support tracing vLLM workers using the `torch.profiler` module. You can enabl The OpenAI server also needs to be started with the `VLLM_TORCH_PROFILER_DIR` environment variable set. 
-When using `benchmarks/benchmark_serving.py`, you can enable profiling by passing the `--profile` flag. +When using `vllm bench serve`, you can enable profiling by passing the `--profile` flag. Traces can be visualized using <https://ui.perfetto.dev/>. +!!! tip +    You can directly call the bench module without installing vLLM using `python -m vllm.entrypoints.cli.main bench`. + !!! tip Only send a few requests through vLLM when profiling, as the traces can get quite large. Also, no need to untar the traces, they can be viewed directly. @@ -35,10 +38,10 @@ VLLM_TORCH_PROFILER_DIR=./vllm_profile \ --model meta-llama/Meta-Llama-3-70B ``` -benchmark_serving.py: +vllm bench command: ```bash -python benchmarks/benchmark_serving.py \ +vllm bench serve \ --backend vllm \ --model meta-llama/Meta-Llama-3-70B \ --dataset-name sharegpt \ @@ -69,13 +72,13 @@ apt install nsight-systems-cli For basic usage, you can just append `nsys profile -o report.nsys-rep --trace-fork-before-exec=true --cuda-graph-trace=node` before any existing script you would run for offline inference. -The following is an example using the `benchmarks/benchmark_latency.py` script: +The following is an example using the `vllm bench latency` script: ```bash nsys profile -o report.nsys-rep \ --trace-fork-before-exec=true \ --cuda-graph-trace=node \ - python benchmarks/benchmark_latency.py \ +vllm bench latency \ --model meta-llama/Llama-3.1-8B-Instruct \ --num-iters-warmup 5 \ --num-iters 1 \ @@ -98,7 +101,7 @@ nsys profile -o report.nsys-rep \ vllm serve meta-llama/Llama-3.1-8B-Instruct # client -python benchmarks/benchmark_serving.py \ +vllm bench serve \ --backend vllm \ --model meta-llama/Llama-3.1-8B-Instruct \ --num-prompts 1 \ @@ -132,7 +135,7 @@ You can view these profiles either as summaries in the CLI, using `nsys stats [p ... 
** CUDA GPU Kernel Summary (cuda_gpu_kern_sum): - Time (%) Total Time (ns) Instances Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name + Time (%) Total Time (ns) Instances Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name -------- --------------- --------- ----------- ----------- -------- --------- ----------- ---------------------------------------------------------------------------------------------------- 46.3 10,327,352,338 17,505 589,965.9 144,383.0 27,040 3,126,460 944,263.8 sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize128x128x64_warpgroupsize1x1x1_execute_segment_k_of… 14.8 3,305,114,764 5,152 641,520.7 293,408.0 287,296 2,822,716 867,124.9 sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize256x128x64_warpgroupsize2x1x1_execute_segment_k_of… @@ -143,7 +146,7 @@ You can view these profiles either as summaries in the CLI, using `nsys stats [p 2.6 587,283,113 37,824 15,526.7 3,008.0 2,719 2,517,756 139,091.1 std::enable_if(int)0&&vllm::_typeConvert::exists, void>::type vllm::fused_add_rms_norm_kern… 1.9 418,362,605 18,912 22,121.5 3,871.0 3,328 2,523,870 175,248.2 void vllm::rotary_embedding_kernel(const long *, T1 *, T1 *, const T1 *, in… 0.7 167,083,069 18,880 8,849.7 2,240.0 1,471 2,499,996 101,436.1 void vllm::reshape_and_cache_flash_kernel<__nv_bfloat16, __nv_bfloat16, (vllm::Fp8KVCacheDataType)0… - ... + ... ``` GUI example: diff --git a/docs/design/v1/p2p_nccl_connector.md b/docs/design/v1/p2p_nccl_connector.md index 9f6acf3291dd2..9d334f8873d97 100644 --- a/docs/design/v1/p2p_nccl_connector.md +++ b/docs/design/v1/p2p_nccl_connector.md @@ -3,14 +3,14 @@ An implementation of xPyD with dynamic scaling based on point-to-point communica # Detailed Design ## Overall Process -As shown in Figure 1, the overall process of this **PD disaggregation** solution is described through a request flow: +As shown in Figure 1, the overall process of this **PD disaggregation** solution is described through a request flow: -1. 
The client sends an HTTP request to the Proxy/Router's `/v1/completions` interface. -2. The Proxy/Router selects a **1P1D (1 Prefill instance + 1 Decode instance)** through either through round-robin or random selection, generates a `request_id` (rules to be introduced later), modifies the `max_tokens` in the HTTP request message to **1**, and then forwards the request to the **P instance**. -3. Immediately afterward, the Proxy/Router forwards the **original HTTP request** to the **D instance**. -4. The **P instance** performs **Prefill** and then **actively sends the generated KV cache** to the D instance (using **PUT_ASYNC** mode). The D instance's `zmq_addr` can be resolved through the `request_id`. -5. The **D instance** has a **dedicated thread** for receiving the KV cache (to avoid blocking the main process). The received KV cache is saved into the **GPU memory buffer**, the size of which is determined by the vLLM startup parameter `kv_buffer_size`. When the GPU buffer is full, the KV cache is stored in the **local Tensor memory pool**. -6. During the **Decode**, the D instance's main process retrieves the KV cache (transmitted by the P instance) from either the **GPU buffer** or the **memory pool**, thereby **skipping Prefill**. +1. The client sends an HTTP request to the Proxy/Router's `/v1/completions` interface. +2. The Proxy/Router selects a **1P1D (1 Prefill instance + 1 Decode instance)** either through round-robin or random selection, generates a `request_id` (rules to be introduced later), modifies the `max_tokens` in the HTTP request message to **1**, and then forwards the request to the **P instance**. +3. Immediately afterward, the Proxy/Router forwards the **original HTTP request** to the **D instance**. +4. The **P instance** performs **Prefill** and then **actively sends the generated KV cache** to the D instance (using **PUT_ASYNC** mode). The D instance's `zmq_addr` can be resolved through the `request_id`. +5. 
The **D instance** has a **dedicated thread** for receiving the KV cache (to avoid blocking the main process). The received KV cache is saved into the **GPU memory buffer**, the size of which is determined by the vLLM startup parameter `kv_buffer_size`. When the GPU buffer is full, the KV cache is stored in the **local Tensor memory pool**. +6. During the **Decode**, the D instance's main process retrieves the KV cache (transmitted by the P instance) from either the **GPU buffer** or the **memory pool**, thereby **skipping Prefill**. 7. After completing **Decode**, the D instance returns the result to the **Proxy/Router**, which then forwards it to the **client**. ![image1](https://github.com/user-attachments/assets/fb01bde6-755b-49f7-ad45-48a94b1e10a7) @@ -291,7 +291,7 @@ curl -X POST -s http://10.0.1.1:10001/v1/completions \ ??? console "Command" ```shell - python3 benchmark_serving.py \ + vllm bench serve \ --backend vllm \ --model base_model \ --tokenizer meta-llama/Llama-3.1-8B-Instruct \ diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md index e820ace4f8fe7..e83dfdb11dadc 100644 --- a/docs/features/multimodal_inputs.md +++ b/docs/features/multimodal_inputs.md @@ -177,6 +177,70 @@ Multi-image input can be extended to perform video captioning. We show this with You can pass a list of NumPy arrays directly to the `'video'` field of the multi-modal dictionary instead of using multi-image input. +Instead of NumPy arrays, you can also pass `'torch.Tensor'` instances, as shown in this example using Qwen2.5-VL: + +??? 
code + + ```python + from transformers import AutoProcessor + from vllm import LLM, SamplingParams + from qwen_vl_utils import process_vision_info + + model_path = "Qwen/Qwen2.5-VL-3B-Instruct/" + video_path = "https://content.pexels.com/videos/free-videos.mp4" + + llm = LLM( + model=model_path, + gpu_memory_utilization=0.8, + enforce_eager=True, + limit_mm_per_prompt={"video": 1}, + ) + + sampling_params = SamplingParams( + max_tokens=1024, + ) + + video_messages = [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": [ + {"type": "text", "text": "describe this video."}, + { + "type": "video", + "video": video_path, + "total_pixels": 20480 * 28 * 28, + "min_pixels": 16 * 28 * 28 + } + ] + }, + ] + + messages = video_messages + processor = AutoProcessor.from_pretrained(model_path) + prompt = processor.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + ) + + image_inputs, video_inputs = process_vision_info(messages) + mm_data = {} + if video_inputs is not None: + mm_data["video"] = video_inputs + + llm_inputs = { + "prompt": prompt, + "multi_modal_data": mm_data, + } + + outputs = llm.generate([llm_inputs], sampling_params=sampling_params) + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + ``` + + !!! note + 'process_vision_info' is only applicable to Qwen2.5-VL and similar models. 
+ Full example: ### Audio Inputs diff --git a/docs/features/quantization/README.md b/docs/features/quantization/README.md index e8c3b11230786..e18c128f30fc9 100644 --- a/docs/features/quantization/README.md +++ b/docs/features/quantization/README.md @@ -6,6 +6,7 @@ Contents: - [Supported Hardware](supported_hardware.md) - [AutoAWQ](auto_awq.md) +- [AutoRound](auto_round.md) - [BitsAndBytes](bnb.md) - [BitBLAS](bitblas.md) - [GGUF](gguf.md) diff --git a/docs/features/quantization/auto_round.md b/docs/features/quantization/auto_round.md new file mode 100644 index 0000000000000..2dfd847bb7d9a --- /dev/null +++ b/docs/features/quantization/auto_round.md @@ -0,0 +1,103 @@ +# AutoRound + +[AutoRound](https://github.com/intel/auto-round) is Intel’s advanced quantization algorithm designed to produce highly efficient **INT2, INT3, INT4, and INT8** +quantized large language models—striking an optimal balance between accuracy and deployment performance. + +AutoRound applies weight-only quantization to transformer-based models, enabling significant memory savings and faster +inference while maintaining near-original accuracy. It supports a wide range of hardware platforms, including **CPUs, +Intel GPUs, HPUs, and CUDA-enabled devices**. + +Please refer to the [AutoRound guide](https://github.com/intel/auto-round/blob/main/docs/step_by_step.md) for more details. + +Key Features: + +✅ **AutoRound, AutoAWQ, AutoGPTQ, and GGUF** are supported + +✅ **10+ vision-language models (VLMs)** are supported + +✅ **Per-layer mixed-bit quantization** for fine-grained control + +✅ **RTN (Round-To-Nearest) mode** for quick quantization with slight accuracy loss + +✅ **Multiple quantization recipes**: best, base, and light + +✅ Advanced utilities such as immediate packing and support for **10+ backends** + +## Installation + +```bash +uv pip install auto-round +``` + +## Quantizing a model + +For VLMs, please change to `auto-round-mllm` in CLI usage and `AutoRoundMLLM` in API usage. 
+ +### CLI usage + +```bash +auto-round \ + --model Qwen/Qwen3-0.6B \ + --bits 4 \ + --group_size 128 \ + --format "auto_round" \ + --output_dir ./tmp_autoround +``` + +```bash +auto-round \ + --model Qwen/Qwen3-0.6B \ + --format "gguf:q4_k_m" \ + --output_dir ./tmp_autoround +``` + +### API usage + +```python +from transformers import AutoModelForCausalLM, AutoTokenizer +from auto_round import AutoRound + +model_name = "Qwen/Qwen3-0.6B" +model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto") +tokenizer = AutoTokenizer.from_pretrained(model_name) + +bits, group_size, sym = 4, 128, True +autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym) + +# the best accuracy, 4-5X slower, low_gpu_mem_usage could save ~20G but ~30% slower +# autoround = AutoRound(model, tokenizer, nsamples=512, iters=1000, low_gpu_mem_usage=True, bits=bits, group_size=group_size, sym=sym) + +# 2-3X speedup, slight accuracy drop at W4G128 +# autoround = AutoRound(model, tokenizer, nsamples=128, iters=50, lr=5e-3, bits=bits, group_size=group_size, sym=sym ) + +output_dir = "./tmp_autoround" +# format= 'auto_round'(default), 'auto_gptq', 'auto_awq' +autoround.quantize_and_save(output_dir, format="auto_round") +``` + +## Running a quantized model with vLLM + +Here is some example code to run auto-round format in vLLM: + +```python +from vllm import LLM, SamplingParams + +prompts = [ + "Hello, my name is", +] +sampling_params = SamplingParams(temperature=0.6, top_p=0.95) +model_name = "Intel/DeepSeek-R1-0528-Qwen3-8B-int4-AutoRound" +llm = LLM(model=model_name) + +outputs = llm.generate(prompts, sampling_params) + +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") +``` + +# Acknowledgement + +Special thanks to open-source low precision libraries such as AutoGPTQ, AutoAWQ, GPTQModel, Triton, Marlin, and +ExLLaMAV2 for providing low-precision CUDA 
kernels, which are leveraged in AutoRound. diff --git a/docs/getting_started/installation/cpu/arm.inc.md b/docs/getting_started/installation/cpu/arm.inc.md index 63ae351b395fb..cac578eefb1d7 100644 --- a/docs/getting_started/installation/cpu/arm.inc.md +++ b/docs/getting_started/installation/cpu/arm.inc.md @@ -33,7 +33,7 @@ Testing has been conducted on AWS Graviton3 instances for compatibility. # --8<-- [end:pre-built-images] # --8<-- [start:build-image-from-source] ```bash -docker build -f docker/Dockerfile.arm \ +docker build -f docker/Dockerfile.cpu \ --tag vllm-cpu-env . # Launching OpenAI server diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 0143d137ff3f9..355ac57094195 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -365,6 +365,7 @@ th { | `Grok1ModelForCausalLM` | Grok1 | `hpcai-tech/grok-1`. | ✅︎ | ✅︎ | ✅︎ | | `HunYuanDenseV1ForCausalLM` | Hunyuan-7B-Instruct-0124 | `tencent/Hunyuan-7B-Instruct-0124` | ✅︎ | | ✅︎ | | `HunYuanMoEV1ForCausalLM` | Hunyuan-80B-A13B | `tencent/Hunyuan-A13B-Instruct`, `tencent/Hunyuan-A13B-Pretrain`, `tencent/Hunyuan-A13B-Instruct-FP8`, etc. | ✅︎ | | ✅︎ | +| `HCXVisionForCausalLM` | HyperCLOVAX-SEED-Vision-Instruct-3B | `naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B` | | | ✅︎ | | `InternLMForCausalLM` | InternLM | `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc. | ✅︎ | ✅︎ | ✅︎ | | `InternLM2ForCausalLM` | InternLM2 | `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc. | ✅︎ | ✅︎ | ✅︎ | | `InternLM3ForCausalLM` | InternLM3 | `internlm/internlm3-8b-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | @@ -592,6 +593,7 @@ Specified using `--task generate`. | `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | ✅︎ | | `H2OVLChatModel` | H2OVL | T + IE+ | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. 
| | ✅︎ | ✅︎ | | `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | | ✅︎ | +| `InternS1ForConditionalGeneration` | Intern-S1 | T + IE+ + VE+ | `internlm/Intern-S1`, etc. | | ✅︎ | ✅︎ | | `InternVLChatModel` | InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + IE+ + (VE+) | `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | ✅︎ | | `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + IE+ + VE+ | `Kwai-Keye/Keye-VL-8B-Preview` | | | ✅︎ | | `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I+ | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | | ✅︎ | @@ -612,6 +614,7 @@ Specified using `--task generate`. | `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + IE | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. | | ✅︎ | ⚠️ | | `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + IE+ | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | | ✅︎ | ✅︎ | | `Phi4MMForCausalLM` | Phi-4-multimodal | T + I+ / T + A+ / I+ + A+ | `microsoft/Phi-4-multimodal-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `Phi4MultimodalForCausalLM` | Phi-4-multimodal (HF Transformers) | T + I+ / T + A+ / I+ + A+ | `microsoft/Phi-4-multimodal-instruct` (with revision `refs/pr/70`), etc. | ✅︎ | ✅︎ | ✅︎ | | `PixtralForConditionalGeneration` | Mistral 3 (Mistral format), Pixtral (Mistral format) | T + I+ | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistralai/Pixtral-12B-2409`, etc. | | ✅︎ | ✅︎ | | `QwenVLForConditionalGeneration`^ | Qwen-VL | T + IE+ | `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc. 
| ✅︎ | ✅︎ | ✅︎ | | `Qwen2AudioForConditionalGeneration` | Qwen2-Audio | T + A+ | `Qwen/Qwen2-Audio-7B-Instruct` | | ✅︎ | ✅︎ | diff --git a/docs/training/rlhf.md b/docs/training/rlhf.md index 4f75e4e01495c..f608a630ab7a5 100644 --- a/docs/training/rlhf.md +++ b/docs/training/rlhf.md @@ -2,10 +2,14 @@ Reinforcement Learning from Human Feedback (RLHF) is a technique that fine-tunes language models using human-generated preference data to align model outputs with desired behaviors. -vLLM can be used to generate the completions for RLHF. The best way to do this is with libraries like [TRL](https://github.com/huggingface/trl), [OpenRLHF](https://github.com/OpenRLHF/OpenRLHF) and [verl](https://github.com/volcengine/verl). +vLLM can be used to generate the completions for RLHF. Some ways to do this include using libraries like [TRL](https://github.com/huggingface/trl), [OpenRLHF](https://github.com/OpenRLHF/OpenRLHF), [verl](https://github.com/volcengine/verl) and [unsloth](https://github.com/unslothai/unsloth). 
See the following basic examples to get started if you don't want to use an existing library: - [Training and inference processes are located on separate GPUs (inspired by OpenRLHF)](../examples/offline_inference/rlhf.md) - [Training and inference processes are colocated on the same GPUs using Ray](../examples/offline_inference/rlhf_colocate.md) - [Utilities for performing RLHF with vLLM](../examples/offline_inference/rlhf_utils.md) + +See the following notebooks showing how to use vLLM for GRPO: + +- [Qwen-3 4B GRPO using Unsloth + vLLM](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_(4B)-GRPO.ipynb) diff --git a/examples/offline_inference/audio_language.py b/examples/offline_inference/audio_language.py index 8014cb53f16a8..01d6a188be994 100644 --- a/examples/offline_inference/audio_language.py +++ b/examples/offline_inference/audio_language.py @@ -190,6 +190,37 @@ def run_phi4mm(question: str, audio_count: int) -> ModelRequestData: ) +def run_phi4_multimodal(question: str, audio_count: int) -> ModelRequestData: + """ + Phi-4-multimodal-instruct supports both image and audio inputs. Here, we + show how to process audio inputs. + """ + model_path = snapshot_download( + "microsoft/Phi-4-multimodal-instruct", revision="refs/pr/70" + ) + # Since the vision-lora and speech-lora co-exist with the base model, + # we have to manually specify the path of the lora weights. 
+ speech_lora_path = os.path.join(model_path, "speech-lora") + placeholders = "<|audio|>" * audio_count + + prompts = f"<|user|>{placeholders}{question}<|end|><|assistant|>" + + engine_args = EngineArgs( + model=model_path, + max_model_len=12800, + max_num_seqs=2, + enable_lora=True, + max_lora_rank=320, + limit_mm_per_prompt={"audio": audio_count}, + ) + + return ModelRequestData( + engine_args=engine_args, + prompt=prompts, + lora_requests=[LoRARequest("speech", 1, speech_lora_path)], + ) + + # Qwen2-Audio def run_qwen2_audio(question: str, audio_count: int) -> ModelRequestData: model_name = "Qwen/Qwen2-Audio-7B-Instruct" @@ -303,6 +334,7 @@ model_example_map = { "granite_speech": run_granite_speech, "minicpmo": run_minicpmo, "phi4_mm": run_phi4mm, + "phi4_multimodal": run_phi4_multimodal, "qwen2_audio": run_qwen2_audio, "qwen2_5_omni": run_qwen2_5_omni, "ultravox": run_ultravox, diff --git a/examples/offline_inference/prithvi_geospatial_mae.py b/examples/offline_inference/prithvi_geospatial_mae.py index 4fdc7a3cf709e..b6007b9f46301 100644 --- a/examples/offline_inference/prithvi_geospatial_mae.py +++ b/examples/offline_inference/prithvi_geospatial_mae.py @@ -3,12 +3,12 @@ import argparse import datetime import os -import re from typing import Union import albumentations import numpy as np import rasterio +import regex as re import torch from einops import rearrange from terratorch.datamodules import Sen1Floods11NonGeoDataModule diff --git a/examples/offline_inference/save_sharded_state.py b/examples/offline_inference/save_sharded_state.py index 9b154e370642b..d6b8b7e6838d7 100644 --- a/examples/offline_inference/save_sharded_state.py +++ b/examples/offline_inference/save_sharded_state.py @@ -29,6 +29,7 @@ import shutil from pathlib import Path from vllm import LLM, EngineArgs +from vllm.model_executor.model_loader import ShardedStateLoader from vllm.utils import FlexibleArgumentParser @@ -39,7 +40,10 @@ def parse_args(): "--output", "-o", required=True, 
type=str, help="path to output checkpoint" ) parser.add_argument( - "--file-pattern", type=str, help="string pattern of saved filenames" + "--file-pattern", + type=str, + default=ShardedStateLoader.DEFAULT_PATTERN, + help="string pattern of saved filenames", ) parser.add_argument( "--max-file-size", diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index e4811c023377f..6f23a29e72f71 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -316,6 +316,85 @@ def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData: ) +# naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B +def run_hyperclovax_seed_vision( + questions: list[str], modality: str +) -> ModelRequestData: + model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B" + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + + engine_args = EngineArgs( + model=model_name, + trust_remote_code=True, + max_model_len=8192 if modality == "image" else 16384, + limit_mm_per_prompt={modality: 1}, + ) + + messages = list() + for question in questions: + if modality == "image": + """ + ocr: List the words in the image in raster order. + Even if the word order feels unnatural for reading, + the model will handle it as long as it follows raster order. + e.g. "Naver, CLOVA, bigshane" + lens_keywords: List the entity names in the image. + e.g. "iPhone" + lens_local_keywords: List the entity names with quads in the image. + e.g. 
"[0.07, 0.21, 0.92, 0.90] iPhone" + """ + messages.append( + [ + { + "role": "user", + "content": [ + { + "type": "image", + "ocr": "", + "lens_keywords": "", + "lens_local_keywords": "", + }, + { + "type": "text", + "text": question, + }, + ], + } + ] + ) + elif modality == "video": + messages.append( + [ + { + "role": "user", + "content": [ + { + "type": "video", + }, + { + "type": "text", + "text": question, + }, + ], + } + ] + ) + else: + raise ValueError(f"Unsupported modality: {modality}") + + prompts = tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + ) + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + stop_token_ids=None, + ) + + # Idefics3-8B-Llama3 def run_idefics3(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" @@ -389,6 +468,39 @@ def run_tarsier(questions: list[str], modality: str) -> ModelRequestData: ) +# Intern-S1 +def run_interns1(questions: list[str], modality: str) -> ModelRequestData: + model_name = "internlm/Intern-S1" + + engine_args = EngineArgs( + model=model_name, + trust_remote_code=True, + max_model_len=8192, + max_num_seqs=2, + limit_mm_per_prompt={modality: 1}, + enforce_eager=True, + ) + + if modality == "image": + placeholder = "" + elif modality == "video": + placeholder = "